Differential expression analysis of LCM RNA Data

Managing Packages Using Renv

To run this code in my project using the renv environment, run the following lines of code

install.packages("renv") #install the package on the new computer (may not be necessary if renv bootstraps itself as expected)
renv::restore() #reinstall all the package versions in the renv lockfile

Load packages

library("genefilter")
library("DESeq2")
library("apeglm")
library("ashr")
library("ggplot2")
library("vsn")
library("hexbin")
library("pheatmap")
library("RColorBrewer")
library("EnhancedVolcano")
library("rtracklayer")
library("tidyverse")
library("circlize")

sessionInfo() #provides list of loaded packages and version of R.
## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.0.1
## 
## Matrix products: default
## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices datasets  utils     methods  
## [8] base     
## 
## other attached packages:
##  [1] circlize_0.4.16             lubridate_1.9.4            
##  [3] forcats_1.0.1               stringr_1.6.0              
##  [5] dplyr_1.1.4                 purrr_1.2.0                
##  [7] readr_2.1.5                 tidyr_1.3.1                
##  [9] tibble_3.3.0                tidyverse_2.0.0            
## [11] rtracklayer_1.70.0          EnhancedVolcano_1.20.0     
## [13] ggrepel_0.9.6               RColorBrewer_1.1-3         
## [15] pheatmap_1.0.13             hexbin_1.28.5              
## [17] vsn_3.70.0                  ggplot2_4.0.0              
## [19] ashr_2.2-63                 apeglm_1.24.0              
## [21] DESeq2_1.50.0               SummarizedExperiment_1.40.0
## [23] Biobase_2.70.0              MatrixGenerics_1.22.0      
## [25] matrixStats_1.5.0           GenomicRanges_1.62.0       
## [27] Seqinfo_1.0.0               IRanges_2.44.0             
## [29] S4Vectors_0.48.0            BiocGenerics_0.56.0        
## [31] generics_0.1.4              genefilter_1.92.0          
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-9             DBI_1.2.3                rlang_1.1.6             
##  [4] magrittr_2.0.4           compiler_4.5.2           RSQLite_2.4.3           
##  [7] png_0.1-8                vctrs_0.6.5              shape_1.4.6.1           
## [10] pkgconfig_2.0.3          crayon_1.5.3             fastmap_1.2.0           
## [13] XVector_0.50.0           Rsamtools_2.26.0         rmarkdown_2.30          
## [16] tzdb_0.5.0               preprocessCore_1.72.0    bit_4.6.0               
## [19] xfun_0.54                cachem_1.1.0             cigarillo_1.0.0         
## [22] jsonlite_2.0.0           blob_1.2.4               DelayedArray_0.36.0     
## [25] BiocParallel_1.44.0      irlba_2.3.5.1            parallel_4.5.2          
## [28] R6_2.6.1                 stringi_1.8.7            bslib_0.9.0             
## [31] SQUAREM_2021.1           limma_3.66.0             jquerylib_0.1.4         
## [34] numDeriv_2016.8-1.1      Rcpp_1.1.0               knitr_1.50              
## [37] timechange_0.3.0         Matrix_1.7-4             splines_4.5.2           
## [40] tidyselect_1.2.1         rstudioapi_0.17.1        dichromat_2.0-0.1       
## [43] abind_1.4-8              yaml_2.3.10              codetools_0.2-20        
## [46] affy_1.88.0              curl_7.0.0               lattice_0.22-7          
## [49] plyr_1.8.9               withr_3.0.2              KEGGREST_1.50.0         
## [52] S7_0.2.0                 coda_0.19-4.1            evaluate_1.0.5          
## [55] survival_3.8-3           Biostrings_2.78.0        pillar_1.11.1           
## [58] affyio_1.80.0            BiocManager_1.30.26      renv_1.1.5              
## [61] RCurl_1.98-1.17          invgamma_1.2             truncnorm_1.0-9         
## [64] emdbook_1.3.14           hms_1.1.4                scales_1.4.0            
## [67] xtable_1.8-4             glue_1.8.0               tools_4.5.2             
## [70] BiocIO_1.20.0            GenomicAlignments_1.46.0 annotate_1.88.0         
## [73] locfit_1.5-9.12          mvtnorm_1.3-3            XML_3.99-0.19           
## [76] grid_4.5.2               bbmle_1.0.25.1           bdsmatrix_1.3-7         
## [79] colorspace_2.1-2         AnnotationDbi_1.72.0     restfulr_0.0.16         
## [82] cli_3.6.5                mixsqp_0.3-54            S4Arrays_1.10.0         
## [85] gtable_0.3.6             sass_0.4.10              digest_0.6.37           
## [88] SparseArray_1.10.1       rjson_0.2.23             farver_2.1.2            
## [91] memoise_2.0.1            htmltools_0.5.8.1        lifecycle_1.0.4         
## [94] httr_1.4.7               GlobalOptions_0.1.2      statmod_1.5.1           
## [97] bit64_4.6.0-1            MASS_7.3-65
#set standard output directory for figures
outdir <- "../output_RNA/differential_expression"

save_ggplot <- function(plot, filename, width = 10, height = 7, units = "in", dpi = 300, bg = "white") {
  print(plot)

  png_path <- file.path(outdir, paste0(filename, ".png"))
  pdf_dir <- file.path(outdir, "pdf_figs")
  pdf_path <- file.path(pdf_dir, paste0(filename, ".pdf"))
  
  # Ensure the pdf_figs directory exists
  if (!dir.exists(pdf_dir)) dir.create(pdf_dir, recursive = TRUE)
  
  # Save plots
  ggsave(filename = png_path, plot = plot, width = width, height = height, units = units, dpi = dpi,bg = bg)
  ggsave(filename = pdf_path, plot = plot, width = width, height = height, units = units, dpi = dpi,bg = bg)
}

# Specify colors
ann_colors = list(Tissue = c(OralEpi = "#FD8D3C" ,Aboral = "mediumpurple1"))

Read and clean count matrix and metadata

Read in raw count data

counts_raw <- read.csv("../output_RNA/stringtie-GeneExt/LCM_RNA_gene_count_matrix.csv", row.names = 1) #load in data

gene_id,LCM_15,LCM_16,LCM_20,LCM_21,LCM_26,LCM_27,LCM_4,LCM_5,LCM_8,LCM_9

Read in metadata

meta <- read.csv("../data_RNA/LCM_RNA_metadata.csv") %>%
            dplyr::arrange(Sample) %>%
            mutate(across(c(Tissue, Fragment, Section_Date, LCM_Date), factor)) # Set variables as factors 

meta$Tissue <- factor(meta$Tissue, levels = c("Aboral","OralEpi")) #we want Aboral to be the baseline

Data sanity checks!

stopifnot(all(meta$Sample %in% colnames(counts_raw))) #are all of the sample names in the metadata column names in the gene count matrix?
stopifnot(all(meta$Sample == colnames(counts_raw))) #are they the same in the same order?

pOverA filtering to reduce dataset

ffun<-filterfun(pOverA(0.49,10))  # Keep genes expressed at 10+ counts in at least 50% of samples
counts_filt_poa <- genefilter((counts_raw), ffun) #apply filter

filtered_counts <- counts_raw[counts_filt_poa,] #keep only rows that passed filter

paste0("Number of genes after filtering: ", sum(counts_filt_poa))
## [1] "Number of genes after filtering: 14464"
write.csv(filtered_counts, file = file.path(outdir, "filtered_counts.csv"))

There are now 14464 genes in the filtered dataset.

Data sanity checks:

all(meta$Sample %in% colnames(filtered_counts)) #are all of the sample names in the metadata column names in the gene count matrix?
## [1] TRUE
all(meta$Sample == colnames(filtered_counts))  #are they the same in the same order? Should be TRUE
## [1] TRUE

DESeq2

Create DESeq object and run DESeq2

dds <- DESeqDataSetFromMatrix(countData = filtered_counts,
                              colData = meta,
                              design= ~ Fragment + Tissue)

dds <- DESeq(dds)

### Extract results for Aboral vs. OralEpi contrast

res <- results(dds, contrast = c("Tissue","OralEpi","Aboral"))
resLFC <- lfcShrink(dds, coef="Tissue_OralEpi_vs_Aboral", res=res, type = "apeglm")

Extract results for adjusted p-value < 0.05

res <- resLFC

resOrdered <- res[order(res$pvalue),]# save differentially expressed genes

DE_05 <- as.data.frame(resOrdered) %>% filter(padj < 0.05 & abs(log2FoldChange) > 1)
DE_05_Up <- DE_05 %>% filter(log2FoldChange < 0) #Higher in Aboral, Lower in OralEpi
DE_05_Down <- DE_05 %>% filter(log2FoldChange > 0) #Lower in Aboral, Higher in OralEpi

nrow(DE_05)
## [1] 1805
nrow(DE_05_Up) #Higher in Aboral, Lower in OralEpi
## [1] 552
nrow(DE_05_Down) #Lower in Aboral, Higher in OralEpi
## [1] 1253
write.csv(as.data.frame(resOrdered), 
          file = file.path(outdir, "DESeq_results.csv"))

write.csv(DE_05, 
          file = file.path(outdir, "DEG_05.csv"))

Visualizing Differential Expression

EnhancedVolcano(res, 
    lab = ifelse(res$padj < 0.05, rownames(res), ""),
    x = "log2FoldChange", 
    shape=16,
    y = "pvalue"
)

volcano_plain <- EnhancedVolcano(res, 
    lab = NA,
    x = 'log2FoldChange',
    y = 'padj',
    pCutoff = 0.05,
    title="",
    subtitle="",
    drawConnectors = TRUE,
    shape=16,
    widthConnectors = 0.75,
    pointSize = 1,
    labSize = 2,boxedLabels = TRUE,max.overlaps = 60)

save_ggplot(volcano_plain, "volcano_plain",width = 4, height = 6, units = "in", dpi = 300)

Plots

plotMA(results(dds, contrast = c("Tissue","Aboral","OralEpi")), ylim=c(-20,20))

plotMA(res, ylim=c(-20,20))

Log2 Fold Change Comparison

resultsNames(dds)
## [1] "Intercept"                "Fragment_B_vs_A"         
## [3] "Fragment_C_vs_A"          "Fragment_D_vs_A"         
## [5] "Fragment_E_vs_A"          "Tissue_OralEpi_vs_Aboral"
# because we are interested in the comparison and not the intercept, we set 'coef=2'
resNorm <- lfcShrink(dds, coef="Tissue_OralEpi_vs_Aboral", type="normal")
resAsh <- lfcShrink(dds, coef="Tissue_OralEpi_vs_Aboral", type="ashr")

par(mfrow=c(1,3), mar=c(4,4,2,1))
xlim <- c(1,1e5); ylim <- c(-20,20)
plotMA(res, xlim=xlim, ylim=ylim, main="apeglm")
plotMA(resNorm, xlim=xlim, ylim=ylim, main="normal")
plotMA(resAsh, xlim=xlim, ylim=ylim, main="ashr")

plotCounts(dds, gene=which.max(res$log2FoldChange), intgroup="Tissue")

plotCounts(dds, gene=which.min(res$log2FoldChange), intgroup="Tissue")

Transforming count data for visualization

vsd <- vst(dds, blind=FALSE)
rld <- rlog(dds, blind=FALSE)
ntd <- normTransform(dds) # this gives log2(n + 1)

meanSdPlot(assay(vsd), main = "vsd")

meanSdPlot(assay(rld))

meanSdPlot(assay(ntd))

#save the vsd transformation
vsd_mat <- assay(vsd)
write.csv(vsd_mat, file = file.path(outdir, "vsd_expression_matrix.csv"))

Will move forward with vst transformation for visualizations

Heatmap of count matrix

heatmap_metadata <- as.data.frame(colData(dds)[,c("Tissue","Fragment")])

#view all genes
pheatmap(assay(vsd), cluster_rows=TRUE, show_rownames=FALSE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)),
         annotation_colors = ann_colors,color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200))

#view highest count genes
select <- order(rowMeans(counts(dds,normalized=TRUE)),
                decreasing=TRUE)[1:20]

pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)),
         annotation_colors = ann_colors, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200))

#view most significantly differentially expressed genes

select <- order(res$padj)[1:20]

pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2, annotation_col=(heatmap_metadata%>% select(Tissue)),
         annotation_colors = ann_colors,color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200))

Heatmap of the sample-to-sample distances

sampleDists <- dist(t(assay(vsd)))

sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vsd$Tissue, vsd$Fragment, sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
         clustering_distance_rows=sampleDists,
         clustering_distance_cols=sampleDists,
         col=colors)

Principal component plot of the samples

pcaData <- plotPCA(vsd, intgroup=c("Tissue", "Fragment"), returnData=TRUE, ntop = 14464)

percentVar <- round(100 * attr(pcaData, "percentVar"))
PCA <- ggplot(pcaData, aes(PC1, PC2, color=Tissue, shape=Fragment)) +
  geom_point(size=2) +
  scale_color_manual(values = c("Aboral" = "mediumpurple1", "OralEpi" = "#FD8D3C"))+
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
  coord_fixed() + theme_bw()

save_ggplot(PCA, "PCA_allgenes")

PCA_small <- ggplot(pcaData, aes(PC1, PC2, color=Tissue)) +
  geom_point(size=2) +
  scale_color_manual(values = c("Aboral" = "mediumpurple1", "OralEpi" = "#FD8D3C"))+
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
  coord_fixed() + theme_bw()

ggsave(filename = paste0(outdir,"/PCA_allgenes_small", ".png"), plot = PCA_small, width = 4, height = 2.5, units = "in", dpi = 300)
pcaData <- plotPCA(vsd, intgroup=c("Tissue", "Fragment"), returnData=TRUE)

percentVar <- round(100 * attr(pcaData, "percentVar"))
PCA <- ggplot(pcaData, aes(PC1, PC2, color=Tissue, shape=Fragment)) +
  geom_point(size=2) +
  scale_color_manual(values = c("Aboral" = "mediumpurple1", "OralEpi" = "#FD8D3C"))+
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
  coord_fixed() + theme_bw()

save_ggplot(PCA, "PCA")

PCA_small <- ggplot(pcaData, aes(PC1, PC2, color=Tissue)) +
  geom_point(size=2) +
  scale_color_manual(values = c("Aboral" = "mediumpurple1", "OralEpi" = "#FD8D3C"))+
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
  coord_fixed() + theme_bw()

ggsave(filename = paste0(outdir,"/PCA_small", ".png"), plot = PCA_small, width = 4, height = 2.5, units = "in", dpi = 300)

Clearly, the majority of the variance in the data is explained by tissue type!

Annotation data

Download annotation files from genome website


# wget files
wget http://cyanophora.rutgers.edu/Pocillopora_acuta/Pocillopora_acuta_HIv2.genes.Conserved_Domain_Search_results.txt.gz

wget http://cyanophora.rutgers.edu/Pocillopora_acuta/Pocillopora_acuta_HIv2.genes.EggNog_results.txt.gz

wget http://cyanophora.rutgers.edu/Pocillopora_acuta/Pocillopora_acuta_HIv2.genes.KEGG_results.txt.gz

# move to references direcotry
mv *gz ../references

# unzip files
gunzip ../references/*gz
EggNog <- read.delim("../references/Pocillopora_acuta_HIv2.genes.EggNog_results.txt") %>% dplyr::rename("query" = X.query)

CDSearch <- read.delim("../references/Pocillopora_acuta_HIv2.genes.Conserved_Domain_Search_results.txt", quote = "") %>% dplyr::rename("query" = X.Query)

KEGG <- read.delim("../references/Pocillopora_acuta_HIv2.genes.KEGG_results.txt", header = FALSE) %>% dplyr::rename("query" = V1, "KeggTerm" = V2)
DE_05$query <- rownames(DE_05)
DE_05_annot <- DE_05 %>% left_join(CDSearch) %>% select(query,everything())
DE_05_eggnog <- DE_05 %>% left_join(EggNog) %>% select(query,everything())

write.csv(as.data.frame(DE_05_eggnog), file=paste0(outdir,"/DE_05_eggnog_annotation.csv"))

annot_all <- as.data.frame(rownames(dds)) %>% dplyr::rename("query" = `rownames(dds)`) %>% left_join(CDSearch)

eggnog_all <- as.data.frame(rownames(dds)) %>% dplyr::rename("query" = `rownames(dds)`) %>% left_join(EggNog)
gene_labels <- eggnog_all %>% select(query,PFAMs) %>%
  mutate_all(~ ifelse(is.na(.), "", .)) %>% #replace NAs with "" for labelling purposes
  separate(PFAMs, into = c("PFAMs", "rest of name"), sep = ",(?=.*?,)", extra = "merge")
  
#view most significantly differentially expressed genes

select <- order(res$padj)[1:50]

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row =gene_labels[select,"PFAMs"], fontsize_row = 5)
top50_DE

#view genes Higher in Aboral, Lower in OralEpi, ordered by log2FoldChange
select <- order(res$log2FoldChange,decreasing = TRUE)[1:50]

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
up_Aboral <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row =gene_labels[select,"PFAMs"], fontsize_row = 5)
up_Aboral

#view genes Lower in Aboral, Higher in OralEpi, ordered by log2FoldChange
select <- order(res$log2FoldChange)[1:50]

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
up_OralEpi <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row =gene_labels[select,"PFAMs"], fontsize_row = 5)
up_OralEpi

Genes of Interest

Single cell marker genes (Levy et al 2021)

MarkerGenes <- read.csv("../references/Pacuta_MarkerGenes_Levy2021.csv") %>% dplyr::rename("query" = 1, "List" = 2, "definition" = 3) %>% filter(List !="Toolkit")

MarkerGenes_broc <- read.csv("../output_RNA/marker_genes/Pacuta_Spis_Markers_pairs.csv") %>% select(protein_id_spB,cluster,Standardized_Name_spA ) %>% dplyr::rename("query" = 1, "List" = 2)

MarkerGenes$def_short <- ifelse(nchar(MarkerGenes$definition) > 20, 
                            paste0(substr(MarkerGenes$definition, 1, 17), "..."), 
                            MarkerGenes$definition)

Biomineralization toolkit

Biomin_broc <- read.csv("../output_RNA/marker_genes/Pacuta_Biomin_Spis_ortholog.csv") %>% dplyr::rename("query" = Pacuta_gene) #%>% select(-c(X,accessionnumber_gene_id, ref))

Biomin_broc <- Biomin_broc %>%
  group_by(query,List) %>%
  summarize(definition = paste(unique(definition), collapse = ","),Classification = paste(unique(Classification), collapse = ",")) 

Biomin_broc$def_short <- ifelse(nchar(Biomin_broc$definition) > 40, 
                            paste0(substr(Biomin_broc$definition, 1, 37), "..."), 
                            Biomin_broc$definition)

Biomin_broc_filtered_counts <- filtered_counts[(rownames(filtered_counts) %in% Biomin_broc$query),]

Additional gene lists

He_etal_Nvec = He, S., Shao, W., Chen, S. (Cynthia), Wang, T. and Gibson, M. C. (2023). Spatial transcriptomics reveals a cnidarian segment polarity program in Nematostella vectensis. Current Biology 33, 2678-2689.e5.

DuBuc_etal_Nvec = DuBuc, T. Q., Stephenson, T. B., Rock, A. Q. and Martindale, M. Q. (2018). Hox and Wnt pattern the primary body axis of an anthozoan cnidarian before gastrulation. Nat Commun 9, 2007.

  • oral/aboral patterning markers in nematostella
He_etal_Nvec <- read.csv("../output_RNA/marker_genes/He_etal_nematostella.csv") %>% dplyr::rename("query" = Pacuta_gene) %>% select(-c(X))

He_etal_Nvec$def_short <- gsub("Homeobox protein", "Hox", He_etal_Nvec$Description, ignore.case = TRUE)

DuBuc_etal_Nvec <- read.csv("../output_RNA/marker_genes/Wnt_nematostella.csv") %>% dplyr::rename("query" = Pacuta_gene) %>% select(-c(X))
DuBuc_etal_Nvec$def_short <- DuBuc_etal_Nvec$Gene_Name

HeatStressGenes <- read.csv("../output_RNA/marker_genes/HeatStressGenes_Pacuta.csv") %>% dplyr::rename("query" = Pacuta_gene) %>% select(-c(X))
DE_05$query <- rownames(DE_05)
resOrdered$query <- rownames(resOrdered)

join_genes_of_interest <- function(df, gene_set) {
  df %>%
    left_join(gene_set, by = "query") %>%
    select(query, everything()) %>%
    drop_na() %>% distinct()
}

DE_05_Biomin_broc    <- join_genes_of_interest(DE_05, Biomin_broc)
DE_05_marker         <- join_genes_of_interest(DE_05, MarkerGenes)
DE_05_marker_broc    <- join_genes_of_interest(DE_05, MarkerGenes_broc)
DE_05_He_etal        <- join_genes_of_interest(DE_05, He_etal_Nvec)
DE_05_DuBuc_etal     <- join_genes_of_interest(DE_05, DuBuc_etal_Nvec)

DESeq_Biomin_broc    <- join_genes_of_interest(as.data.frame(resOrdered), Biomin_broc)
DESeq_marker         <- join_genes_of_interest(as.data.frame(resOrdered), MarkerGenes)
DESeq_marker_broc    <- join_genes_of_interest(as.data.frame(resOrdered), MarkerGenes_broc)
DESeq_He_etal        <- join_genes_of_interest(as.data.frame(resOrdered), He_etal_Nvec)
DESeq_DuBuc_etal     <- join_genes_of_interest(as.data.frame(resOrdered), DuBuc_etal_Nvec)

DE_05_HeatStressGenes <- HeatStressGenes %>% left_join(DE_05, by="query") %>% select(query,everything()) %>% drop_na(baseMean)
DESeq_HeatStressGenes <- HeatStressGenes %>% left_join(as.data.frame(resOrdered), by="query") %>% select(query,everything())  %>% drop_na(baseMean)

write.csv(as.data.frame(DESeq_Biomin_broc), file=paste0(outdir,"/DESeq_biomin_annotation.csv"))
write.csv(as.data.frame(DE_05_Biomin_broc), file=paste0(outdir,"/DE_05_biomin_annotation.csv"))
write.csv(as.data.frame(DE_05_marker), file=paste0(outdir,"/DE_05_markergene_annotation.csv"))

Biomin_broc_all_counts <- as.data.frame(counts(dds)) %>% mutate(query = rownames(dds)) %>% select(query,everything()) %>% left_join(Biomin_broc) 
Biomin_broc_all_res <- as.data.frame(res) %>% mutate(query = rownames(res)) %>% select(query,everything()) %>% left_join(Biomin_broc) 

markers_all_counts <- as.data.frame(counts(dds)) %>% mutate(query = rownames(dds)) %>% select(query,everything()) %>% left_join(MarkerGenes) 
markers_all_res <- as.data.frame(res) %>% mutate(query = rownames(res)) %>% select(query,everything()) %>% left_join(MarkerGenes) 

broc_markers_all_counts <- as.data.frame(counts(dds)) %>% mutate(query = rownames(dds)) %>% select(query,everything()) %>% left_join(MarkerGenes_broc) 
broc_markers_all_res <- as.data.frame(res) %>% mutate(query = rownames(res)) %>% select(query,everything()) %>% left_join(MarkerGenes_broc) 

Trying to annotate un-annotated DE genes:

# wget protein sequence reference file
wget http://cyanophora.rutgers.edu/Pocillopora_acuta/Pocillopora_acuta_HIv2.genes.pep.faa.gz

# move to references direcotry
mv *gz ../references

# unzip files
gunzip ../references/*gz

#get the names of all the DEGs from the first column of the DEG csv file
tail -n +2 ../output_RNA/differential_expression/DEG_05.csv | cut -d',' -f1 | tr -d '"' > ../output_RNA/differential_expression/DEG_05_names.csv

#grep this file against the protein fasta file, first with wc -l to make sure the number of lines is correct (should be your number of DEGs)
grep -f ../output_RNA/differential_expression/DEG_05_names.csv ../references/Pocillopora_acuta_HIv2.genes.pep.faa | wc -l

#grep each header with the protein sequence after ("-A 1") and save to a new file
grep -A 1 -f ../output_RNA/differential_expression/DEG_05_names.csv ../references/Pocillopora_acuta_HIv2.genes.pep.faa \
  | grep -v "^--$" > ../output_RNA/differential_expression/DEG_05_seqs.txt

### all expressed genes 
  
#get the names of all the DEGs from the first column of the DEG csv file
tail -n +2 ../output_RNA/differential_expression/DESeq_results.csv | cut -d',' -f1 | tr -d '"' > ../output_RNA/differential_expression/expressed_genes_names.csv

#grep this file against the protein fasta file, first with wc -l to make sure the number of lines is correct (should be your number of DEGs)
grep -f ../output_RNA/differential_expression/expressed_genes_names.csv ../references/Pocillopora_acuta_HIv2.genes.pep.faa | wc -l

#grep each header with the protein sequence after ("-A 1") and save to a new file
grep -A 1 -f ../output_RNA/differential_expression/expressed_genes_names.csv ../references/Pocillopora_acuta_HIv2.genes.pep.faa \
  | grep -v "^--$" > ../output_RNA/differential_expression/expressed_genes_seqs.txt

SwissProt of ALL genes

On unity:

swissprot based on https://github.com/urol-e5/deep-dive/blob/main/D-Apul/code/20-Apul-gene-annotation.Rmd and https://github.com/urol-e5/deep-dive/blob/main/F-Pmea/code/20-Pmea-gene-annotation.Rmd

Steven’s notebook post here: https://sr320.github.io/tumbling-oysters/posts/sr320-27-go/

mkdir ../references/blast_dbs
cd ../references/blast_dbs
curl -O https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz
mv uniprot_sprot.fasta.gz uniprot_sprot_r2024_10_02.fasta.gz
gunzip -k uniprot_sprot_r2024_10_02.fasta.gz
rm uniprot_sprot_r2024_10_02.fasta.gz

head uniprot_sprot_r2024_10_02.fasta
echo "Number of Sequences"
grep -c ">" uniprot_sprot_r2024_10_02.fasta
# 572214 sequences
module load blast-plus/2.14.1

makeblastdb \
-in ../references/blast_dbs/uniprot_sprot_r2024_10_02.fasta \
-dbtype prot \
-out ../references/blast_dbs/uniprot_sprot_r2024_10_02
cd ../scripts
nano blastp_SwissProt.sh
#!/bin/bash
#SBATCH -t 18:00:00
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=48
#SBATCH --mem=500GB
#SBATCH --export=NONE
#SBATCH --error=../scripts/outs_errs/"%x_error.%j" #write out slurm error reports
#SBATCH --output=../scripts/outs_errs/"%x_output.%j" #write out any program outpus
#SBATCH --mail-type=BEGIN,END,FAIL #email you when job starts, stops and/or fails
#SBATCH -D /project/pi_hputnam_uri_edu/zdellaert/LaserCoral #set working directory

module load blast-plus/2.14.1

cd references/
mkdir annotation

fasta="Pocillopora_acuta_HIv2.genes.pep.faa"

blastp \
-query $fasta \
-db blast_dbs/uniprot_sprot_r2024_10_02 \
-out annotation/blastp_SwissProt_out.tab \
-evalue 1E-05 \
-num_threads 48 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6

echo "Blast complete!" $(date)
cd references/annotation/

tr '|' '\t' <  blastp_SwissProt_out.tab >  blastp_SwissProt_out_sep.tab
cd ../references/annotation/

curl -H "Accept: text/plain; format=tsv" "https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Creviewed%2Cid%2Cprotein_name%2Cgene_names%2Corganism_name%2Clength%2Cgo_p%2Cgo%2Cgo_id%2Cgo_c%2Cgo_f&format=tsv&query=%28reviewed%3Atrue%29" -o SwissProt-Annot-GO_111524.tsv

wc -l SwissProt-Annot-GO_111524.tsv
#572215

All code below based on https://github.com/urol-e5/deep-dive/blob/main/D-Apul/code/20-Apul-gene-annotation.Rmd and https://github.com/urol-e5/deep-dive/blob/main/F-Pmea/code/20-Pmea-gene-annotation.Rmd

Steven’s notebook post here: https://sr320.github.io/tumbling-oysters/posts/sr320-27-go/

bltabl <- read.csv("../references/annotation/blastp_SwissProt_out_sep.tab", sep = '\t', header = FALSE)

spgo <- read.csv("../references/annotation/SwissProt-Annot-GO_111524.tsv", sep = '\t', header = TRUE)
annot_tab <- left_join(bltabl, spgo, by = c("V3" = "Entry")) %>%
  select(
    query = V1,
    blast_hit = V3,
    perc_ident = V5,
    evalue = V13,
    ProteinNames = Protein.names,
    BiologicalProcess = Gene.Ontology..biological.process.,
    GeneOntologyIDs = Gene.Ontology.IDs,
    CellularComponent = Gene.Ontology..cellular.component.,
    MolecularFunction = Gene.Ontology..molecular.function.,
  )

head(annot_tab)
##                                        query blast_hit perc_ident    evalue
## 1 Pocillopora_acuta_HIv2___RNAseq.g24143.t1a    Q4JAI4     31.494  1.02e-37
## 2  Pocillopora_acuta_HIv2___RNAseq.g18333.t1    O08807     79.897 9.62e-116
## 3   Pocillopora_acuta_HIv2___RNAseq.g7985.t1    O74212     50.572 3.56e-158
## 4      Pocillopora_acuta_HIv2___TS.g15308.t1    Q09575     29.327  1.08e-12
## 5   Pocillopora_acuta_HIv2___RNAseq.g2057.t1    P0C1P0     42.623  8.81e-14
## 6   Pocillopora_acuta_HIv2___RNAseq.g4696.t1    Q9W2Q5     51.852  8.98e-69
##                                                                                                                                                                                                     ProteinNames
## 1                                                                                                                                              Methionine synthase (EC 2.1.1.-) (Homocysteine methyltransferase)
## 2 Peroxiredoxin-4 (EC 1.11.1.24) (Antioxidant enzyme AOE372) (Peroxiredoxin IV) (Prx-IV) (Thioredoxin peroxidase AO372) (Thioredoxin-dependent peroxide reductase A0372) (Thioredoxin-dependent peroxiredoxin 4)
## 3                                                                                                   Acyl-lipid (8-3)-desaturase (EC 1.14.19.30) (Delta(5) fatty acid desaturase) (Delta-5 fatty acid desaturase)
## 4                                                                                                                                                                                Uncharacterized protein K02A2.6
## 5                                                                              Phosphatidylinositol N-acetylglucosaminyltransferase subunit Y (Phosphatidylinositol-glycan biosynthesis class Y protein) (PIG-Y)
## 6                                                                                                                                                                   Calcium and integrin-binding family member 2
##                                                                                                                                                                                                                                                                                                                                                                                                                   BiologicalProcess
## 1                                                                                                                                                                                                                                                                                                                                                            methionine biosynthetic process [GO:0009086]; methylation [GO:0032259]
## 2 cell redox homeostasis [GO:0045454]; extracellular matrix organization [GO:0030198]; hydrogen peroxide catabolic process [GO:0042744]; male gonad development [GO:0008584]; negative regulation of male germ cell proliferation [GO:2000255]; protein maturation by protein folding [GO:0022417]; reactive oxygen species metabolic process [GO:0072593]; response to oxidative stress [GO:0006979]; spermatogenesis [GO:0007283]
## 3                                                                                                                                                                                                                                                                                                                 long-chain fatty acid biosynthetic process [GO:0042759]; unsaturated fatty acid biosynthetic process [GO:0006636]
## 4                                                                                                                                                                                                                                                                                                                                                                                                      DNA integration [GO:0015074]
## 5                                                                                                                                                                                                                                                                                                                                                                                      GPI anchor biosynthetic process [GO:0006506]
## 6                                                                                                                                                                                                                                                                                                                                                              calcium ion homeostasis [GO:0055074]; phototransduction [GO:0007602]
##                                                                                                                                                                                                          GeneOntologyIDs
## 1                                                                                                                                                                         GO:0003871; GO:0008270; GO:0009086; GO:0032259
## 2 GO:0005615; GO:0005737; GO:0005739; GO:0005783; GO:0005790; GO:0005829; GO:0006979; GO:0007283; GO:0008379; GO:0008584; GO:0022417; GO:0030198; GO:0042744; GO:0042802; GO:0045454; GO:0072593; GO:0140313; GO:2000255
## 3                                                                                                                                                 GO:0006636; GO:0016020; GO:0020037; GO:0042759; GO:0046872; GO:0102866
## 4                                                                                                                                                 GO:0003676; GO:0005737; GO:0008270; GO:0015074; GO:0019899; GO:0042575
## 5                                                                                                                                                                                                 GO:0000506; GO:0006506
## 6                                                                                                                                                             GO:0000287; GO:0005509; GO:0005737; GO:0007602; GO:0055074
##                                                                                                                                                                           CellularComponent
## 1                                                                                                                                                                                          
## 2 cytoplasm [GO:0005737]; cytosol [GO:0005829]; endoplasmic reticulum [GO:0005783]; extracellular space [GO:0005615]; mitochondrion [GO:0005739]; smooth endoplasmic reticulum [GO:0005790]
## 3                                                                                                                                                                     membrane [GO:0016020]
## 4                                                                                                                               cytoplasm [GO:0005737]; DNA polymerase complex [GO:0042575]
## 5                                                                                               glycosylphosphatidylinositol-N-acetylglucosaminyltransferase (GPI-GnT) complex [GO:0000506]
## 6                                                                                                                                                                    cytoplasm [GO:0005737]
##                                                                                                                    MolecularFunction
## 1        5-methyltetrahydropteroyltriglutamate-homocysteine S-methyltransferase activity [GO:0003871]; zinc ion binding [GO:0008270]
## 2 identical protein binding [GO:0042802]; molecular sequestering activity [GO:0140313]; thioredoxin peroxidase activity [GO:0008379]
## 3        di-homo-gamma-linolenate delta5 desaturase activity [GO:0102866]; heme binding [GO:0020037]; metal ion binding [GO:0046872]
## 4                                      enzyme binding [GO:0019899]; nucleic acid binding [GO:0003676]; zinc ion binding [GO:0008270]
## 5                                                                                                                                   
## 6                                                               calcium ion binding [GO:0005509]; magnesium ion binding [GO:0000287]
rm(spgo)
# potenital additional cutoffs if desired
#annot_tab <- annot_tab %>% filter(perc_ident >= 25)
#annot_tab <- annot_tab %>% filter(evalue < 1e-10)

write.table(annot_tab, 
            file = "../references/annotation/protein-GO.tsv", 
            sep = "\t", 
            row.names = FALSE, 
            quote = FALSE)

DESeq_SwissProt <- as.data.frame(resOrdered) %>% left_join(annot_tab) %>% select(query,everything()) 
DE_05_SwissProt <- DESeq_SwissProt %>% filter(query %in% DE_05$query)

write.csv(as.data.frame(DESeq_SwissProt), file=paste0(outdir,"/DESeq_SwissProt_annotation.csv"))
write.csv(as.data.frame(DE_05_SwissProt), file=paste0(outdir,"/DE_05_SwissProt_annotation.csv"))
DE_05_SwissProt$short_name <- ifelse(nchar(DE_05_SwissProt$ProteinNames) > 30, 
                            paste0(substr(DE_05_SwissProt$ProteinNames, 1, 27), "..."), 
                            DE_05_SwissProt$ProteinNames)

gene_labels <- DE_05_SwissProt %>% 
  select(query,short_name) %>%
  mutate_all(~ ifelse(is.na(.), "", .)) #replace NAs with "" for labelling purposes

#view most significantly differentially expressed genes

select <- order(res$padj)[1:50]
rownames(res)[select]
##  [1] "Pocillopora_acuta_HIv2___RNAseq.g6351.t1"  
##  [2] "Pocillopora_acuta_HIv2___RNAseq.g28575.t1a"
##  [3] "Pocillopora_acuta_HIv2___RNAseq.g27038.t1" 
##  [4] "Pocillopora_acuta_HIv2___RNAseq.g2165.t1"  
##  [5] "Pocillopora_acuta_HIv2___TS.g18104.t1"     
##  [6] "Pocillopora_acuta_HIv2___RNAseq.g14330.t3" 
##  [7] "Pocillopora_acuta_HIv2___RNAseq.g14025.t1" 
##  [8] "Pocillopora_acuta_HIv2___RNAseq.g14090.t1" 
##  [9] "Pocillopora_acuta_HIv2___RNAseq.g10378.t1" 
## [10] "Pocillopora_acuta_HIv2___RNAseq.g3832.t1"  
## [11] "Pocillopora_acuta_HIv2___RNAseq.g8006.t1"  
## [12] "Pocillopora_acuta_HIv2___RNAseq.g14253.t1" 
## [13] "Pocillopora_acuta_HIv2___RNAseq.g25431.t1" 
## [14] "Pocillopora_acuta_HIv2___RNAseq.11056_t"   
## [15] "Pocillopora_acuta_HIv2___RNAseq.g9588.t1"  
## [16] "Pocillopora_acuta_HIv2___TS.g30765.t1"     
## [17] "Pocillopora_acuta_HIv2___RNAseq.g14021.t1" 
## [18] "Pocillopora_acuta_HIv2___RNAseq.g19082.t1" 
## [19] "Pocillopora_acuta_HIv2___RNAseq.g16202.t1" 
## [20] "Pocillopora_acuta_HIv2___TS.g19991.t2"     
## [21] "Pocillopora_acuta_HIv2___RNAseq.g20860.t1" 
## [22] "Pocillopora_acuta_HIv2___RNAseq.g8588.t1"  
## [23] "Pocillopora_acuta_HIv2___RNAseq.g26604.t1" 
## [24] "Pocillopora_acuta_HIv2___TS.g18100.t1"     
## [25] "Pocillopora_acuta_HIv2___TS.g27642.t1b"    
## [26] "Pocillopora_acuta_HIv2___RNAseq.g11588.t1" 
## [27] "Pocillopora_acuta_HIv2___RNAseq.g25327.t1" 
## [28] "Pocillopora_acuta_HIv2___RNAseq.g12281.t1" 
## [29] "Pocillopora_acuta_HIv2___RNAseq.g19085.t1" 
## [30] "Pocillopora_acuta_HIv2___RNAseq.g19284.t1" 
## [31] "Pocillopora_acuta_HIv2___TS.g11360.t1"     
## [32] "Pocillopora_acuta_HIv2___RNAseq.g7803.t1"  
## [33] "Pocillopora_acuta_HIv2___TS.g16008.t2"     
## [34] "Pocillopora_acuta_HIv2___RNAseq.g22978.t3b"
## [35] "Pocillopora_acuta_HIv2___RNAseq.g2406.t1"  
## [36] "Pocillopora_acuta_HIv2___RNAseq.g8119.t1"  
## [37] "Pocillopora_acuta_HIv2___TS.g9414.t1"      
## [38] "Pocillopora_acuta_HIv2___RNAseq.g21374.t1" 
## [39] "Pocillopora_acuta_HIv2___RNAseq.g21000.t1" 
## [40] "Pocillopora_acuta_HIv2___RNAseq.g5252.t1"  
## [41] "Pocillopora_acuta_HIv2___RNAseq.g22853.t1" 
## [42] "Pocillopora_acuta_HIv2___RNAseq.g17117.t1" 
## [43] "Pocillopora_acuta_HIv2___RNAseq.g25759.t1" 
## [44] "Pocillopora_acuta_HIv2___TS.g16802.t1"     
## [45] "Pocillopora_acuta_HIv2___RNAseq.g24120.t1" 
## [46] "Pocillopora_acuta_HIv2___RNAseq.30415_t"   
## [47] "Pocillopora_acuta_HIv2___RNAseq.g14336.t1" 
## [48] "Pocillopora_acuta_HIv2___RNAseq.g26594.t2" 
## [49] "Pocillopora_acuta_HIv2___RNAseq.g19115.t1" 
## [50] "Pocillopora_acuta_HIv2___TS.g4983.t1"
z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(rownames(res)[select],(gene_labels$query)),2], fontsize_row = 6)
top50_DE

#view most significantly differentially expressed genes by LFC

select <- order(abs(res$log2FoldChange),decreasing = TRUE)[1:50]
rownames(res)[select]
##  [1] "Pocillopora_acuta_HIv2___RNAseq.g28575.t1a"
##  [2] "Pocillopora_acuta_HIv2___TS.g18104.t1"     
##  [3] "Pocillopora_acuta_HIv2___RNAseq.g27038.t1" 
##  [4] "Pocillopora_acuta_HIv2___RNAseq.g2165.t1"  
##  [5] "Pocillopora_acuta_HIv2___RNAseq.g14025.t1" 
##  [6] "Pocillopora_acuta_HIv2___TS.g30765.t1"     
##  [7] "Pocillopora_acuta_HIv2___RNAseq.g14330.t3" 
##  [8] "Pocillopora_acuta_HIv2___RNAseq.g14090.t1" 
##  [9] "Pocillopora_acuta_HIv2___RNAseq.g10378.t1" 
## [10] "Pocillopora_acuta_HIv2___RNAseq.g8006.t1"  
## [11] "Pocillopora_acuta_HIv2___RNAseq.g14253.t1" 
## [12] "Pocillopora_acuta_HIv2___RNAseq.g22261.t1" 
## [13] "Pocillopora_acuta_HIv2___RNAseq.g19082.t1" 
## [14] "Pocillopora_acuta_HIv2___TS.g19991.t2"     
## [15] "Pocillopora_acuta_HIv2___RNAseq.11056_t"   
## [16] "Pocillopora_acuta_HIv2___TS.g4983.t1"      
## [17] "Pocillopora_acuta_HIv2___RNAseq.g14021.t1" 
## [18] "Pocillopora_acuta_HIv2___RNAseq.g8588.t1"  
## [19] "Pocillopora_acuta_HIv2___RNAseq.g12281.t1" 
## [20] "Pocillopora_acuta_HIv2___RNAseq.g20860.t1" 
## [21] "Pocillopora_acuta_HIv2___TS.g9414.t1"      
## [22] "Pocillopora_acuta_HIv2___RNAseq.g11588.t1" 
## [23] "Pocillopora_acuta_HIv2___RNAseq.g21000.t1" 
## [24] "Pocillopora_acuta_HIv2___TS.g27642.t1b"    
## [25] "Pocillopora_acuta_HIv2___RNAseq.g7803.t1"  
## [26] "Pocillopora_acuta_HIv2___RNAseq.g22978.t3b"
## [27] "Pocillopora_acuta_HIv2___RNAseq.g8119.t1"  
## [28] "Pocillopora_acuta_HIv2___TS.g16008.t2"     
## [29] "Pocillopora_acuta_HIv2___RNAseq.30415_t"   
## [30] "Pocillopora_acuta_HIv2___RNAseq.g2406.t1"  
## [31] "Pocillopora_acuta_HIv2___RNAseq.g17117.t1" 
## [32] "Pocillopora_acuta_HIv2___RNAseq.g8062.t1"  
## [33] "Pocillopora_acuta_HIv2___RNAseq.g25759.t1" 
## [34] "Pocillopora_acuta_HIv2___RNAseq.g14336.t1" 
## [35] "Pocillopora_acuta_HIv2___RNAseq.g9631.t1"  
## [36] "Pocillopora_acuta_HIv2___RNAseq.g24120.t1" 
## [37] "Pocillopora_acuta_HIv2___RNAseq.g7627.t1"  
## [38] "Pocillopora_acuta_HIv2___RNAseq.g1126.t1"  
## [39] "Pocillopora_acuta_HIv2___RNAseq.g1102.t1"  
## [40] "Pocillopora_acuta_HIv2___RNAseq.g14484.t1" 
## [41] "Pocillopora_acuta_HIv2___TS.g16384.t1"     
## [42] "Pocillopora_acuta_HIv2___RNAseq.g21373.t1" 
## [43] "Pocillopora_acuta_HIv2___RNAseq.g13561.t1" 
## [44] "Pocillopora_acuta_HIv2___RNAseq.g9210.t1"  
## [45] "Pocillopora_acuta_HIv2___TS.g25577.t1a"    
## [46] "Pocillopora_acuta_HIv2___RNAseq.g829.t1"   
## [47] "Pocillopora_acuta_HIv2___RNAseq.g11659.t1" 
## [48] "Pocillopora_acuta_HIv2___TS.g1968.t2"      
## [49] "Pocillopora_acuta_HIv2___TS.g22105.t1"     
## [50] "Pocillopora_acuta_HIv2___RNAseq.g27681.t1b"
z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(rownames(res)[select],(gene_labels$query)),2], fontsize_row = 6)
top50_DE

#view genes Higher in Aboral, Lower in OralEpi, ordered by log2FoldChange
select <- order(res$log2FoldChange,decreasing = TRUE)[1:50]

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
up_Aboral <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(rownames(res)[select],(gene_labels$query)),2], fontsize_row = 5)
up_Aboral

#view genes Lower in Aboral, Higher in OralEpi, ordered by log2FoldChange
select <- order(res$log2FoldChange)[1:50]

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
up_OralEpi <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row =gene_labels[match(rownames(res)[select],(gene_labels$query)),2], fontsize_row = 5)
print(up_OralEpi)

### Expand biomineralization gene list

Biomin_broc_orig <- Biomin_broc
broc_groups <- data.table::fread("../../multi-sp-snRNA/reference_genes/cnidarian_marker_genes/output/broccoli/dir_step3/table_OGs_protein_names.txt", header=T) %>% select(c("#OG_name","Pacu_proteins.faa"))  %>% filter(`Pacu_proteins.faa` != "") %>% rename(OG_name = "#OG_name")
broc_groups_sep <- broc_groups %>% separate_longer_delim(cols = Pacu_proteins.faa, delim = " ")
broc_groups_sep <- broc_groups_sep %>% rename(query = Pacu_proteins.faa)

head(broc_groups)
head(broc_groups_sep)
# 30744 genes are in the orthogroups 

Biomin_broc_orig_broc_expanded <- Biomin_broc_orig %>% left_join(broc_groups_sep, by="query")

Biomin_broc_orig_broc_expanded2 <- Biomin_broc_orig_broc_expanded %>% left_join(broc_groups_sep, by="OG_name", suffix = c(".orig", ""))

Biomin_broc_orig_broc_expanded2_clean <- Biomin_broc_orig_broc_expanded2 %>% select(-query.orig) %>% distinct() %>% select(query,everything())

Biomin_broc_orig_broc_expanded2_clean <- Biomin_broc_orig_broc_expanded2_clean %>%
  group_by(query,List) %>%
  summarize(definition = paste(unique(definition), collapse = ","),Classification = paste(unique(Classification), collapse = ","),def_short = paste(unique(def_short), collapse = ","))


DE_05_Biomin_broc    <- join_genes_of_interest(DE_05, Biomin_broc_orig_broc_expanded2_clean)
DESeq_Biomin_broc    <- join_genes_of_interest(as.data.frame(resOrdered), Biomin_broc_orig_broc_expanded2_clean)
#Biomin_broc <- Biomin_broc_orig_broc_expanded2_clean

keywords_manual_search_biomin_noortholog <- c("Collagen alpha", "hemicentin", "skeletal organic matrix", "carbonic anhydrase", "coadhesin", "Von Willebrand Factor D")

manual_search_biomin_noortholog <- DESeq_SwissProt %>% filter(grepl(
  paste(keywords_manual_search_biomin_noortholog,collapse="|"),ProteinNames,ignore.case = TRUE)) %>%
  select(query, ProteinNames) %>% 
  rename(definition = ProteinNames) %>% mutate(List="Searched Keywords") %>% 
  filter(!(query %in% Biomin_broc$query)) #only keep these if they're not already in our list

manual_search_biomin_noortholog <- manual_search_biomin_noortholog %>% mutate(
  Classification = 
    ifelse(grepl("Collagen", definition, ignore.case = TRUE), "Extracellular matrix/cell adhesion",
    ifelse(grepl("hemicentin", definition, ignore.case = TRUE), "Extracellular matrix/cell adhesion",
    ifelse(grepl("Uncharacterized skeletal organic matrix", definition, ignore.case = TRUE), "Uncharacterized biomineralization proteins",
    ifelse(grepl("MAM and LDL", definition, ignore.case = TRUE), "Extracellular matrix/cell adhesion",
    ifelse(grepl("carbonic anhydrase", definition, ignore.case = TRUE), "Enzymes",
    ifelse(grepl("coadhesin", definition, ignore.case = TRUE), "Extracellular matrix/cell adhesion",
    ifelse(grepl("Von Willebrand Factor D", definition, ignore.case = TRUE), "Extracellular matrix/cell adhesion",NA)))))))
)

manual_search_biomin_noortholog$def_short <- ifelse(nchar(manual_search_biomin_noortholog$definition) > 40, 
                            paste0(substr(manual_search_biomin_noortholog$definition, 1, 37), "..."), 
                            manual_search_biomin_noortholog$definition)

#Biomin_broc_orig <- Biomin_broc

Biomin_broc <- Biomin_broc %>% bind_rows(manual_search_biomin_noortholog)

Biomin_broc_filtered_counts <- filtered_counts[(rownames(filtered_counts) %in% Biomin_broc$query),]

DE_05_Biomin_broc    <- join_genes_of_interest(DE_05, Biomin_broc)
DESeq_Biomin_broc    <- join_genes_of_interest(as.data.frame(resOrdered), Biomin_broc)

write.csv(as.data.frame(DESeq_Biomin_broc), file=paste0(outdir,"/DESeq_biomin_annotation.csv"))
write.csv(as.data.frame(DE_05_Biomin_broc), file=paste0(outdir,"/DE_05_biomin_annotation.csv"))
Biomin_broc_all_counts <- as.data.frame(counts(dds)) %>% mutate(query = rownames(dds)) %>% select(query,everything()) %>% left_join(Biomin_broc) 
Biomin_broc_all_res <- as.data.frame(res) %>% mutate(query = rownames(res)) %>% select(query,everything()) %>% left_join(Biomin_broc) 

Clean labels and try to categorize annotations

Manual_AllGenes <- DESeq_SwissProt %>% mutate(Heatmap_Label = ProteinNames) %>% relocate(Heatmap_Label, .after=evalue)

Manual_AllGenes$Heatmap_Label <- gsub("Homeobox protein", "Hox", Manual_AllGenes$Heatmap_Label, ignore.case = TRUE)
Manual_AllGenes$Heatmap_Label <- gsub("Protein Wnt", "Wnt", Manual_AllGenes$Heatmap_Label, ignore.case = TRUE)

#fill any missing values that are biomin genes with the biomin gene label
Manual_AllGenes <- Manual_AllGenes %>%
  left_join(
    Biomin_broc %>% select(query, definition),
    by = "query"
  ) %>%
  mutate(Heatmap_Label = coalesce(Heatmap_Label, definition)) %>%
  select(-definition)

Manual_AllGenes <- Manual_AllGenes %>% mutate(
        Heatmap_Label = str_replace(Heatmap_Label, "\\s+\\(.*", ""),
        Heatmap_Label = str_replace(Heatmap_Label, "\\s+\\[.*", ""),
        Heatmap_Label_short = str_trunc(Heatmap_Label, 50),
        Heatmap_Label_short = str_wrap(Heatmap_Label, width = 25)
      )

Manual <- Manual_AllGenes %>% filter(query %in% DE_05$query)

gene_labels <- Manual %>% 
  select(query,Heatmap_Label) %>%
  mutate_all(~ ifelse(is.na(.), "", .)) #replace NAs with "" for labelling purposes
### Zona pellucida domain
PFAM_ZP_domain <- annot_all %>% filter(grepl("Zona_pellucida",Short.name,ignore.case=TRUE))  %>% pull(query) %>% unique()

#### He et al 2023 TFs/genes of interest Nematostella (NOTE! I confirmed the DE_05 ones are all represented above ) + DuBuc et al 2018 Wnt/Aboral-Oral Patterning Genes, interest Nematostella

DESeq_He_etal$source <- "He_etal_2023"
DESeq_DuBuc_etal$source <- "DuBuc_etal_2018"

Hox_Literature <- rbind(DESeq_He_etal,DESeq_DuBuc_etal)

DESeq_Hox_Literature_collapsed <- Hox_Literature %>%
  group_by(query, baseMean, log2FoldChange, lfcSE, pvalue, padj) %>%
  summarize(
    Gene_Name = paste(unique(Gene_Name), collapse = "; "),
    Description = paste(unique(Description), collapse = "; "),
    def_short = paste(unique(def_short), collapse = "; "),
    source = paste(unique(source), collapse = "; "),
    .groups = "drop"
  )

WNT_HOX_MAN <- Manual_AllGenes %>% filter(grepl("wnt", Heatmap_Label, ignore.case = TRUE)|grepl("frizzle", Heatmap_Label, ignore.case = TRUE)|grepl("homeobox", Heatmap_Label, ignore.case = TRUE)|grepl("hox", Heatmap_Label, ignore.case = TRUE)) %>% select(-c(blast_hit,evalue,ProteinNames,BiologicalProcess,GeneOntologyIDs))# %>% rename(Description = Heatmap_Label)

COMBINED_HOX_WNT_all <-  full_join(DESeq_Hox_Literature_collapsed,WNT_HOX_MAN,by=c("query")) %>%
  mutate(
    baseMean = coalesce(baseMean.x, baseMean.y),
    log2FoldChange = coalesce(log2FoldChange.x, log2FoldChange.y),
    lfcSE = coalesce(lfcSE.x, lfcSE.y),
    pvalue = coalesce(pvalue.x, pvalue.y),
    padj = coalesce(padj.x, padj.y)
  ) %>%
  select(-ends_with(".x"), -ends_with(".y"))

COMBINED_HOX_WNT_all <- COMBINED_HOX_WNT_all %>%  mutate(Heatmap_Label = coalesce(Heatmap_Label, def_short)) %>%
                              mutate(Heatmap_Label = gsub("Protein Wnt", "Wnt", Heatmap_Label, ignore.case = TRUE)) %>% 
                              mutate(pathway = ifelse(grepl("ox",Heatmap_Label)|grepl("Six",Heatmap_Label), "HOX", "WNT"))
COMBINED_HOX_WNT_DE05 <-  COMBINED_HOX_WNT_all %>% filter(padj<0.05) %>% filter(abs(log2FoldChange) >1)

write.csv(COMBINED_HOX_WNT_all, file=paste0(outdir,"/HOX_Wnt_genes.csv"))
cluster_8_9_genes <- read.csv("../output_RNA/differential_expression/semantic-enrichment/cluster_gene_lists/Oral_enrich_BP_Sensory.csv")
library(purrr)

DESeq_SwissProt_categorized <- Manual_AllGenes %>% rowwise() %>%
  mutate(category = list(
    c(if (query %in% DESeq_Biomin_broc$query) "Biomineralization",
      if (grepl("transient", ProteinNames, ignore.case = TRUE)) "TRP Channel",
      if ((grepl("mucin|lectin|toll-|nitric|sulfotransferase|Complement", ProteinNames, ignore.case = TRUE) |
           grepl("toll-like receptor|NF-kappaB|complement activation|3'-phosphoadenosine 5'-phosphosulfate",
                 BiologicalProcess, ignore.case = TRUE)) & !grepl("Collectin", ProteinNames)) "Mucus- and Bacteria-related",
      if (grepl("amino acid transport|ammonium transmembrane transport|carbohydrate transport|
                glycerol transmembrane transport|cholesterol transport", BiologicalProcess, ignore.case = TRUE) &
          !grepl("neurotransmitter|synaptic|pigmentation", BiologicalProcess, ignore.case = TRUE)) "Transport",
      if (grepl("Solute carrier", ProteinNames, ignore.case = TRUE)) "Solute Carrier (SLC Family)",
      if (query %in% cluster_8_9_genes) "Sensors",
      if (query %in% PFAM_ZP_domain) "Zona Pellucida",
      if (query %in% COMBINED_HOX_WNT_all$query) "Hox/Wnt Developmental Signaling"
    )
  )) %>%
  ungroup()

DESeq_SwissProt_categorized_collapsed <- DESeq_SwissProt_categorized %>%
  mutate(category = sapply(category, function(x) paste(x, collapse = "; "))) %>%
  relocate(category, .after = ProteinNames)

category_annotation <- DESeq_SwissProt_categorized_collapsed %>%
  select(query, category) %>%
  column_to_rownames("query")

write.csv(DESeq_SwissProt_categorized_collapsed, file=paste0(outdir,"/DESeq_SwissProt_annotation_categorized.csv"))

Test out new labels on heatmaps

#view most significantly differentially expressed genes

select <- order(res$padj)[1:50]

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         annotation_row=category_annotation,
         labels_row = gene_labels[match(rownames(res)[select],(gene_labels$query)),2], fontsize_row = 6)
top50_DE
save_ggplot(top50_DE, "top50_DE_Manual")

dev.off()
## null device 
##           1
#view most significantly differentially expressed genes by LFC

select <- order(abs(res$log2FoldChange),decreasing = TRUE)[1:50]

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE_LFC <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         annotation_row=category_annotation,
         labels_row = gene_labels[match(rownames(res)[select],(gene_labels$query)),2], fontsize_row = 6)
top50_DE_LFC
save_ggplot(top50_DE_LFC, "top50_LFC_DE_Manual")
dev.off()
## null device 
##           1
#view genes Higher in Aboral, Lower in OralEpi, ordered by log2FoldChange
select <- order(res$log2FoldChange)[1:50]

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
up_Aboral <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), labels_col = NA, annotation_colors = ann_colors,
          annotation_row=category_annotation,
         labels_row =gene_labels[match(rownames(res)[select],(gene_labels$query)),2], fontsize_row = 8.5)
up_Aboral
save_ggplot(up_Aboral, "up_Aboral_Manual")

#view genes Lower in Aboral, Higher in OralEpi, ordered by log2FoldChange
select <- order(res$log2FoldChange,decreasing = TRUE)[1:50]

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
up_OralEpi <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), labels_col = NA, annotation_colors = ann_colors,
          annotation_row=category_annotation,
         labels_row =gene_labels[match(rownames(res)[select],(gene_labels$query)),2], fontsize_row = 8.5)
up_OralEpi
save_ggplot(up_OralEpi, "up_OralEpi_Manual")

Expression of certain genes of interest: blast against P. acuta genome

yin yang Transctiption factor

cd ../references

#download the genome protein fasta if you have not already
wget http://cyanophora.rutgers.edu/Pocillopora_acuta/Pocillopora_acuta_HIv2.genes.pep.faa.gz

#unzip file
gunzip Pocillopora_acuta_HIv2.genes.pep.faa.gz

In unity

salloc -p cpu -c 8 --mem 32G

module load uri/main
module load BLAST+/2.15.0-gompi-2023a

#make blast_dbs directory if you haven't done so above
mkdir blast_dbs
cd blast_dbs

makeblastdb -in ../Pocillopora_acuta_HIv2.genes.pep.faa -out Pacuta_prot -dbtype prot

cd ../../output_RNA/differential_expression/

#make blast output directory if you haven't done so above
mkdir blast
cd blast

nano YinYang.txt

add accession numbers of interest:

  • nematostella vectensis transcriptional repressor protein YY1 isoforms
    • XP_048585772.1
    • XP_048585773.1
    • XP_048585774.1
  • human transcription factor YY2
    • NP_996806.2
  • human transcription factor ZFP41 (REX-1)
    • NP_777560.2
XP_048585772.1
XP_048585773.1
XP_048585774.1
NP_996806.2
NP_777560.2
# Read the input file line by line and fetch FASTA sequences
while read -r accession; do
  if [[ -n "$accession" ]]; then
    echo "Fetching $accession..."
    curl -s "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=${accession}&rettype=fasta&retmode=text" >> "YinYang.fasta"
    echo >> "YinYang.fasta"  # Add a newline between sequences
    sleep 1  # Avoid hitting rate limits
  fi
done < "YinYang.txt"

# run blast with human-readable output
blastp -query YinYang.fasta -db ../../../references/blast_dbs/Pacuta_prot -out YinYang_blast_results.txt -outfmt 0

#looks like there are a lot of matches for each gene! I am going to do a tab search with a very low e-value cutoff:

# run blast with tabular output
blastp -query YinYang.fasta -db ../../../references/blast_dbs/Pacuta_prot -out YinYang_blast_results_tab.txt -outfmt 6 -evalue 1e-25

Great! Interestingly, the same gene, Pocillopora_acuta_HIv2_RNAseq.g25242.t1 is the top match for all 5 proteins I searched. Pocillopora_acuta_HIv2_TS.g24434.t1 is the second best match for all 5. That is interesting! Pocillopora_acuta_HIv2_RNAseq.g7583.t1 is also worth looking at. And Pocillopora_acuta_HIv2_TS.g21338.t1 matched all three YY1 isoforms.

YinYangs <- c("Pocillopora_acuta_HIv2___RNAseq.g25242.t1",
              "Pocillopora_acuta_HIv2___TS.g24434.t1",
              "Pocillopora_acuta_HIv2___RNAseq.g7583.t1",
              "Pocillopora_acuta_HIv2___TS.g21338.t1")

for (i in 1:length(YinYangs)){
  plotCounts(dds, gene=YinYangs[i], intgroup=c("Tissue"),)
}

as.data.frame(resOrdered)[YinYangs,]
##                                             baseMean log2FoldChange     lfcSE
## Pocillopora_acuta_HIv2___RNAseq.g25242.t1 2388.43282     0.67034225 0.8174612
## Pocillopora_acuta_HIv2___TS.g24434.t1       86.14232     0.15295231 1.0061439
## Pocillopora_acuta_HIv2___RNAseq.g7583.t1   429.08727    -0.04567773 0.9611454
## Pocillopora_acuta_HIv2___TS.g21338.t1      277.12637     0.05272309 1.0097629
##                                              pvalue      padj
## Pocillopora_acuta_HIv2___RNAseq.g25242.t1 0.1696289 0.3781033
## Pocillopora_acuta_HIv2___TS.g24434.t1     0.2935493 0.5402412
## Pocillopora_acuta_HIv2___RNAseq.g7583.t1  0.8212499 0.9266460
## Pocillopora_acuta_HIv2___TS.g21338.t1     0.2451260 0.4833504
##                                                                               query
## Pocillopora_acuta_HIv2___RNAseq.g25242.t1 Pocillopora_acuta_HIv2___RNAseq.g25242.t1
## Pocillopora_acuta_HIv2___TS.g24434.t1         Pocillopora_acuta_HIv2___TS.g24434.t1
## Pocillopora_acuta_HIv2___RNAseq.g7583.t1   Pocillopora_acuta_HIv2___RNAseq.g7583.t1
## Pocillopora_acuta_HIv2___TS.g21338.t1         Pocillopora_acuta_HIv2___TS.g21338.t1

Okay! None of these genes are differentially expressed between the tissues. That is interesting and good to know. Pocillopora_acuta_HIv2___RNAseq.g25242.t1 has the highest basal expression of all the potential isoforms of this transcription factor.

TRP Channels

DE_05_SwissProt$short_name <- ifelse(nchar(DE_05_SwissProt$ProteinNames) > 80, 
                            paste0(substr(DE_05_SwissProt$ProteinNames, 1, 77), "..."), 
                            DE_05_SwissProt$ProteinNames)

TRP <- Manual %>% filter(grepl("transient", ProteinNames, ignore.case = TRUE)) 
select <- TRP$query

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(select,(gene_labels$query)),2], fontsize_row = 12)
top50_DE

annot_tab$short_name <- ifelse(nchar(annot_tab$ProteinNames) > 80, 
                            paste0(substr(annot_tab$ProteinNames, 1, 77), "..."), 
                            annot_tab$ProteinNames)

swissprot_labels <- annot_tab  %>% 
  select(query,short_name) %>%
  mutate_all(~ ifelse(is.na(.), "", .)) #replace NAs with "" for labelling purposes

TRP <- annot_tab %>% filter(grepl("transient", ProteinNames, ignore.case = TRUE))
select1 <- TRP$query
select <- match(select1,rownames(vsd))
select <- select[!is.na(select)]

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = swissprot_labels[match(select1,swissprot_labels$query),2], fontsize_row = 6)
top50_DE

biomin de heatmap nice

DE_05_Biomin_broc_filtered <- DE_05_Biomin_broc %>% left_join(Manual,by="query" ) #%>% filter(!(Heatmap_Label %in% c("Myosin-9", "Actin, cytoplasmic")))
  
select <- DE_05_Biomin_broc_filtered$query
z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))

annotation_row = DE_05_Biomin_broc_filtered %>%
    select(query, Classification) %>%
    tibble::column_to_rownames("query")

DE_Biomin_broc <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_row=annotation_row,
         annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(select,(gene_labels$query)),2], fontsize_row = 8, cutree_rows = 2)
DE_Biomin_broc
save_ggplot(DE_Biomin_broc, "DE_Biomin_broc",height=10)

# Perform clustering
row_clusters <- hclust(dist(z_scores))
cluster_assignments <- cutree(row_clusters, k = 2) # Adjust k as needed

# Create a dataframe to manage clusters and reordering
clustered_data <- data.frame(
  query = select,
  Heatmap_Label = gene_labels[match(select, gene_labels$query), 2],
  cluster = cluster_assignments
)

# Reorder rows within each cluster alphabetically by their labels
clustered_data <- clustered_data %>%
  arrange(cluster, Heatmap_Label)

# Reorder the z_scores matrix and labels based on the new order
z_scores <- z_scores[match(clustered_data$query, rownames(z_scores)), ]
ordered_labels <- clustered_data$Heatmap_Label

# Generate heatmap with reordered rows and labels
DE_Biomin_broc <- pheatmap(
  z_scores,
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),
  cluster_rows = FALSE,  # Disable clustering since rows are pre-ordered
  cluster_cols = TRUE,
  show_rownames = TRUE,
  cutree_cols = 2,
  annotation_col = (heatmap_metadata %>% select(Tissue)),
  annotation_row=annotation_row,
  annotation_colors = ann_colors,
  labels_row = ordered_labels,
  fontsize_row = 8,
  borders_color = "white",  
  gaps_row = c(53) 
)
save_ggplot(DE_Biomin_broc, "clusters_clean/DE_Biomin_broc",height=10)

labels_all <- Manual_AllGenes %>% 
  select(query,Heatmap_Label) %>%
  mutate_all(~ ifelse(is.na(.), "", .)) 

DESeq_Biomin_broc_filtered <- DESeq_Biomin_broc %>% left_join(Manual_AllGenes %>% select(query,Heatmap_Label))
  
select <- DESeq_Biomin_broc_filtered$query
z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))

annotation_row = DESeq_Biomin_broc_filtered %>%
    select(query, Classification) %>%
    tibble::column_to_rownames("query")

DE_Biomin_broc <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_row=annotation_row,
         annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = labels_all[match(select,(labels_all$query)),2], fontsize_row = 8, cutree_rows = 2)
DE_Biomin_broc
save_ggplot(DE_Biomin_broc, "ALL_Biomin_broc",bg="white",height = 18)

# Perform clustering
row_clusters <- hclust(dist(z_scores))
cluster_assignments <- cutree(row_clusters, k = 2) # Adjust k as needed

# Create a dataframe to manage clusters and reordering
clustered_data <- data.frame(
  query = select,
  Heatmap_Label = labels_all[match(select, labels_all$query), 2],
  cluster = cluster_assignments
)

# Reorder rows within each cluster alphabetically by their labels
clustered_data <- clustered_data %>%
  arrange(cluster, Heatmap_Label)

# Reorder the z_scores matrix and labels based on the new order
z_scores <- z_scores[match(clustered_data$query, rownames(z_scores)), ]
ordered_labels <- clustered_data$Heatmap_Label

# Generate heatmap with reordered rows and labels
DE_Biomin_broc <- pheatmap(
  z_scores,
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),
  cluster_rows = FALSE,  # Disable clustering since rows are pre-ordered
  cluster_cols = TRUE,
  show_rownames = TRUE,
  cutree_cols = 2,
  annotation_col = (heatmap_metadata %>% select(Tissue)),
  annotation_colors = ann_colors,
  labels_row = ordered_labels,
  annotation_row=annotation_row,
  fontsize_row = 8,
  borders_color = "white",  
  gaps_row = c(81) 
)
save_ggplot(DE_Biomin_broc, "clusters_clean/All_Biomin_broc",bg="white",height = 18)

vst_counts <- assay(vsd)["Pocillopora_acuta_HIv2___RNAseq.g15280.t1", ]
df <- as.data.frame(colData(dds))
df$expression <- vst_counts

library(ggplot2)
ggplot(df, aes(x=Tissue, y=expression, group=Fragment, color=Fragment)) +
  geom_point(size=3) +
  geom_line() +
  theme_minimal() +
  labs(title="VST Expression by Tissue", y="VST normalized expression")

Marker DE nice

#view marker genes that are differentially expressed
select <- DE_05_marker$query
z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
DE_marker <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), 
                      cluster_rows=TRUE, show_rownames=TRUE,
                      cluster_cols=TRUE, cutree_cols = 2, cutree_rows = 5,
                      annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
                      labels_row = DE_05_marker$List, fontsize_row = 7)
DE_marker

#view marker genes that are differentially expressed
select <- DE_05_marker$query
z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
DE_marker <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), 
                      cluster_rows=TRUE, show_rownames=TRUE,
                      cluster_cols=TRUE, cutree_cols = 2, cutree_rows = 5,
                      annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
                      labels_row = paste0(DE_05_marker$List,": ", gene_labels[match(select,(gene_labels$query)),2]), fontsize_row = 7)
DE_marker

#view marker genes that are differentially expressed
select <- DE_05_marker_broc$query
z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
DE_marker_broc <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), 
                      cluster_rows=TRUE, show_rownames=TRUE,
                      cluster_cols=TRUE, cutree_cols = 2, cutree_rows = 5,
                      annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
                      labels_row = DE_05_marker_broc$List, fontsize_row = 7)
DE_marker_broc

#view marker genes that are differentially expressed
select <- DE_05_marker_broc$query
z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
DE_marker_broc <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), 
                      cluster_rows=TRUE, show_rownames=TRUE,
                      cluster_cols=TRUE, cutree_cols = 2, cutree_rows = 5,
                      annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
                      labels_row = paste0(DE_05_marker_broc$List,": ", gene_labels[match(select,(gene_labels$query)),2]), fontsize_row = 6)
DE_marker_broc

DE_05_marker_broc <- DE_05_marker_broc %>% left_join(Manual,by=c("query")) %>% select(!ends_with(".y"))

Polar plot

heatmap_meta_df <- heatmap_metadata %>%
  tibble::rownames_to_column("sample")
#get list of all expressed marker genes
markers <- DESeq_marker_broc %>% select(query, Standardized_Name_spA) %>% distinct()

# get VST expression values
vsd_mat <- as.data.frame(assay(vsd)) %>% 
  tibble::rownames_to_column("gene")

# make VST expression value df into long format, and add marker info, filter to only keep expressed marker genes

mat_long <- vsd_mat %>%
  pivot_longer(-gene, names_to = "sample", values_to = "vst") %>%
  left_join(heatmap_meta_df %>% select(sample, Tissue), by = "sample") %>%
  left_join(markers, by = c("gene" = "query")) %>%
  filter(!is.na(Standardized_Name_spA))

# gene mean by tissue
gene_tissue <- mat_long %>%
  group_by(Standardized_Name_spA, gene, Tissue) %>%
  summarise(mean_vst = mean(vst, na.rm = TRUE), .groups = "drop")

plot_df <- gene_tissue %>%
  pivot_wider(names_from = Tissue, values_from = mean_vst, values_fill = 0)
tissue_cols <- setdiff(colnames(plot_df), c("Standardized_Name_spA", "gene"))

# outer = OralEpi (t1), inner = Aboral (t2)
t1 <- tissue_cols[tissue_cols=="OralEpi"]
t2 <- tissue_cols[tissue_cols=="Aboral"]

# Sort sectors by total expression for better visual flow
sector_totals <- plot_df %>%
  group_by(Standardized_Name_spA) %>%
  summarise(total_expr = sum(.data[[t1]] + .data[[t2]], na.rm = TRUE)) %>%
  arrange(desc(total_expr))

plot_df <- plot_df %>%
  mutate(Standardized_Name_spA = factor(Standardized_Name_spA, 
                                       levels = sector_totals$Standardized_Name_spA))

gene_counts <- table(plot_df$Standardized_Name_spA)
xlims <- cbind(rep(0, length(gene_counts)), as.numeric(gene_counts))
maxval <- max(c(plot_df[[t1]], plot_df[[t2]]), na.rm = TRUE)
maxval <- signif(maxval,0)
ylim <- c(0, maxval)

png("../output_RNA/differential_expression/Allbrocmarker_circ.png", width = 3000, height = 3000, res = 300)

circos.clear()
circos.par(start.degree = 90, gap.after = 4, cell.padding = c(0.01, 0, 0.01, 0))
circos.initialize(factors = names(gene_counts), xlim = xlims)

# Sector labels with improved styling
circos.track(
  ylim = c(0, 1), 
  track.height = 0.08, 
  bg.border = NA,
  panel.fun = function(x, y) {
    sector <- CELL_META$sector.index
    # Calculate text size based on sector name length
    text_cex <- min(0.7, 10/nchar(sector))
    circos.text(CELL_META$xcenter, 0.5, sector,
                facing = "bending.inside", 
                cex = text_cex,
                font = 2)  # Bold text
  }
)

circos.trackPlotRegion(
  ylim = c(-maxval, maxval),  # negative to positive
  track.height = 0.5,
  bg.border = "gray90",
  panel.fun = function(x, y) {
    
    sector <- CELL_META$sector.index
    genes <- plot_df %>% filter(Standardized_Name_spA == sector)

    # Add horizontal grid lines at standardized levels
    grid_levels <- c(maxval * -0.25, maxval * -0.5, maxval * -0.75, maxval * 0.25, maxval * 0.5, maxval * 0.75)
    for (level in grid_levels) {
      circos.lines(c(0, CELL_META$xlim[2]), c(level, level),
                   col = "gray90", lwd = 0.8)
    }

    # Add y-axis labels on first sector
    if (CELL_META$sector.numeric.index == 1) {
      circos.yaxis(side = "left", at = c(-maxval, -maxval/2, 0, maxval/2, maxval),
                   labels = c(sprintf("%.0f", -maxval), sprintf("%.0f", -maxval/2),"0", sprintf("%.0f", maxval/2), sprintf("%.0f", maxval)),
                   tick.length = 0.15, labels.cex = 0.8, lwd = 1)
    }
    
    for (i in seq_len(nrow(genes))) {
      # Tissue 1 goes up (positive)
      circos.rect(i-1, 0, i, genes[[t1]][i], col = "#FD8D3C")
      # Tissue 2 goes down (negative) 
      circos.rect(i-1, 0, i, -genes[[t2]][i], col = "mediumpurple1")
    }
  }
)

# Enhanced legend with better positioning and styling
par(xpd = TRUE)
legend(x = -1.3, y = 1.3, 
       legend = paste(c("Inner:", "Outer:"), c(t1, t2)), 
       fill = colors,
       border = "white",
       bty = "n",
       cex = 0.9,
       pt.cex = 1.2)

# Add subtitle with sample info
n_genes <- nrow(plot_df)
n_sectors <- length(unique(plot_df$Standardized_Name_spA))
mtext(sprintf("VST-normalized expression | %d genes across %d gene categories", 
              n_genes, n_sectors),
      side = 3, line = -1, cex = 0.8, col = "gray40")

dev.off()
## quartz_off_screen 
##                 2
# Prepare data using existing DESeq results
plot_df <- DESeq_marker_broc %>%
  select(query, Standardized_Name_spA, log2FoldChange) %>%
  # Remove any rows with missing data
  filter(!is.na(log2FoldChange), !is.na(Standardized_Name_spA)) %>%
  # Rename for clarity
  rename(gene = query, log2FC = log2FoldChange)

#plot_df <- plot_df %>% mutate(log2FC = -1*log2FC)
  
# Sort sectors by total absolute fold change for better visual flow
sector_totals <- plot_df %>%
  group_by(Standardized_Name_spA) %>%
  summarise(total_abs_fc = sum(abs(log2FC), na.rm = TRUE)) %>%
  arrange(desc(total_abs_fc))

plot_df <- plot_df %>%
  mutate(Standardized_Name_spA = factor(Standardized_Name_spA, 
                                       levels = sector_totals$Standardized_Name_spA))

# Set up circos parameters
gene_counts <- table(plot_df$Standardized_Name_spA)
xlims <- cbind(rep(0, length(gene_counts)), as.numeric(gene_counts))

# Create symmetric scale around 0
max_lfc <- max(abs(plot_df$log2FC), na.rm = TRUE)
max_lfc <- ceiling(max_lfc)  # Round up to nearest integer for clean axis
ylim <- c(-max_lfc, max_lfc)

# Define colors for positive and negative fold changes
pos_color <- "#FD8D3C"  # Orange for positive (higher in tissue1)
neg_color <- "mediumpurple1"  # Purple for negative (higher in tissue2)

# Create the plot
png("../output_RNA/differential_expression/Allbrocmarker_lfc_circ.png", 
    width = 3000, height = 3000, res = 300)

circos.clear()
circos.par(start.degree = 90, gap.after = 8, cell.padding = c(0.01, 0, 0.01, 0))
circos.initialize(factors = names(gene_counts), xlim = xlims)

# Sector labels
circos.track(
  ylim = c(0, 1), 
  track.height = 0.08, 
  bg.border = NA,
  panel.fun = function(x, y) {
    sector <- CELL_META$sector.index
    text_cex <- min(0.7, 10/nchar(sector))
    circos.text(CELL_META$xcenter, 0.5, sector,
                facing = "bending.inside", 
                cex = text_cex,
                font = 2)
  }
)

# Main track with diverging bars
circos.trackPlotRegion(
  ylim = ylim,
  track.height = 0.4,
  bg.border = "gray90",
  panel.fun = function(x, y) {
    sector <- CELL_META$sector.index
    genes <- plot_df %>% filter(Standardized_Name_spA == sector)
    
    # Add horizontal grid lines
    grid_levels <- seq(-max_lfc, max_lfc, by = max_lfc/2)
    for (level in grid_levels) {
      circos.lines(c(0, CELL_META$xlim[2]), c(level, level), 
                   col = "gray92", lwd = 0.8)
    }
    
    # Add center line at 0
    circos.lines(c(0, CELL_META$xlim[2]), c(0, 0), 
                 col = "gray60", lwd = 1.2)
    
    # Plot bars
    for (i in seq_len(nrow(genes))) {
      lfc_val <- genes$log2FC[i]
      bar_color <- ifelse(lfc_val > 0, pos_color, neg_color)
      
      circos.rect(i-1, 0, i, lfc_val,
                  col = bar_color, border = "white", lwd = 0.3)
    }
    
    # Add y-axis labels on first sector
    if (CELL_META$sector.numeric.index == 1) {
      axis_at <- seq(-max_lfc, max_lfc, by = max_lfc/2)
      circos.yaxis(side = "left", at = axis_at,
                   labels = sprintf("%.0f", axis_at),
                   tick.length = 0.15, labels.cex = 0.8, lwd = 1)
    }
  }
)

# Enhanced legend
par(xpd = TRUE)
legend(x = -1.3, y = 1.3, 
       legend = c(paste("Higher in", t1), "Equal", paste("Higher in", t2)), 
       fill = c(pos_color, "gray90", neg_color),
       border = "white",
       bty = "n",
       cex = 0.9,
       pt.cex = 1.2)

# Add informative subtitle
n_genes <- nrow(plot_df)
n_sectors <- length(unique(plot_df$Standardized_Name_spA))
mtext(sprintf("Log2 Fold Change | %d genes across %d categories", 
              n_genes, n_sectors),
      side = 3, line = -1, cex = 0.8, col = "gray40")

# Add interpretation guide
mtext("Positive = higher in tissue1 | Negative = higher in tissue2", 
      side = 1, line = -1, cex = 0.7, col = "gray50")

dev.off()
## quartz_off_screen 
##                 2
#get list of all expressed marker genes
markers <- DE_05_marker_broc %>% select(query, Standardized_Name_spA) %>% distinct()

# get VST expression values
vsd_mat <- as.data.frame(assay(vsd)) %>% 
  tibble::rownames_to_column("gene")

# make VST expression value df into long format, and add marker info, filter to only keep expressed marker genes

mat_long <- vsd_mat %>%
  pivot_longer(-gene, names_to = "sample", values_to = "vst") %>%
  left_join(heatmap_meta_df %>% select(sample, Tissue), by = "sample") %>%
  left_join(markers, by = c("gene" = "query")) %>%
  filter(!is.na(Standardized_Name_spA))

# gene mean by tissue
gene_tissue <- mat_long %>%
  group_by(Standardized_Name_spA, gene, Tissue) %>%
  summarise(mean_vst = mean(vst, na.rm = TRUE), .groups = "drop")

plot_df <- gene_tissue %>%
  pivot_wider(names_from = Tissue, values_from = mean_vst, values_fill = 0)
tissue_cols <- setdiff(colnames(plot_df), c("Standardized_Name_spA", "gene"))

# outer = OralEpi (t1), inner = Aboral (t2)
t1 <- tissue_cols[tissue_cols=="OralEpi"]
t2 <- tissue_cols[tissue_cols=="Aboral"]

# Sort sectors by total expression for better visual flow
sector_totals <- plot_df %>%
  group_by(Standardized_Name_spA) %>%
  summarise(total_expr = sum(.data[[t1]] + .data[[t2]], na.rm = TRUE)) %>%
  arrange(desc(total_expr))

plot_df <- plot_df %>%
  mutate(Standardized_Name_spA = factor(Standardized_Name_spA, 
                                       levels = sector_totals$Standardized_Name_spA))

gene_counts <- table(plot_df$Standardized_Name_spA)
xlims <- cbind(rep(0, length(gene_counts)), as.numeric(gene_counts))
maxval <- max(c(plot_df[[t1]], plot_df[[t2]]), na.rm = TRUE)
maxval <- signif(maxval,0)
ylim <- c(0, maxval)

png("../output_RNA/differential_expression/DE_05_brocmarker_circ.png", width = 3000, height = 3000, res = 300)

circos.clear()
circos.par(start.degree = 90, gap.after = 4, cell.padding = c(0.01, 0, 0.01, 0))
circos.initialize(factors = names(gene_counts), xlim = xlims)

# Sector labels with improved styling
circos.track(
  ylim = c(0, 1), 
  track.height = 0.08, 
  bg.border = NA,
  panel.fun = function(x, y) {
    sector <- CELL_META$sector.index
    # Calculate text size based on sector name length
    text_cex <- min(0.7, 10/nchar(sector))
    circos.text(CELL_META$xcenter, 0.5, sector,
                facing = "bending.inside", 
                cex = text_cex,
                font = 2)  # Bold text
  }
)

circos.trackPlotRegion(
  ylim = c(-maxval, maxval),  # negative to positive
  track.height = 0.5,
  bg.border = "gray90",
  panel.fun = function(x, y) {
    
    sector <- CELL_META$sector.index
    genes <- plot_df %>% filter(Standardized_Name_spA == sector)

    # Add horizontal grid lines at standardized levels
    grid_levels <- c(maxval * -0.25, maxval * -0.5, maxval * -0.75, maxval * 0.25, maxval * 0.5, maxval * 0.75)
    for (level in grid_levels) {
      circos.lines(c(0, CELL_META$xlim[2]), c(level, level),
                   col = "gray90", lwd = 0.8)
    }

    # Add y-axis labels on first sector
    if (CELL_META$sector.numeric.index == 1) {
      circos.yaxis(side = "left", at = c(-maxval, -maxval/2, 0, maxval/2, maxval),
                   labels = c(sprintf("%.0f", -maxval), sprintf("%.0f", -maxval/2),"0", sprintf("%.0f", maxval/2), sprintf("%.0f", maxval)),
                   tick.length = 0.15, labels.cex = 0.8, lwd = 1)
    }
    
    for (i in seq_len(nrow(genes))) {
      # Tissue 1 goes up (positive)
      circos.rect(i-1, 0, i, genes[[t1]][i], col = "#FD8D3C",border="white")
      # Tissue 2 goes down (negative) 
      circos.rect(i-1, 0, i, -genes[[t2]][i], col = "mediumpurple1",border="white")
    }
  }
)

# Enhanced legend with better positioning and styling
par(xpd = TRUE)
legend(x = -1.3, y = 1.3, 
       legend = paste(c("Inner:", "Outer:"), c(t1, t2)), 
       fill = colors,
       border = "white",
       bty = "n",
       cex = 0.9,
       pt.cex = 1.2)

# Add subtitle with sample info
n_genes <- nrow(plot_df)
n_sectors <- length(unique(plot_df$Standardized_Name_spA))
mtext(sprintf("VST-normalized expression | %d genes across %d gene categories", 
              n_genes, n_sectors),
      side = 3, line = -1, cex = 0.8, col = "gray40")

dev.off()
## quartz_off_screen 
##                 2
tissue_cols <- setdiff(colnames(plot_df), c("Standardized_Name_spA", "gene"))

# outer = OralEpi (t1), inner = Aboral (t2)
t1 <- tissue_cols[tissue_cols=="OralEpi"]
t2 <- tissue_cols[tissue_cols=="Aboral"]

# Sort sectors by tissue dominance for better visual flow
sector_totals <- plot_df %>%
  group_by(Standardized_Name_spA) %>%
  summarise(
    n_t1 = sum(.data[[t1]] > .data[[t2]], na.rm = TRUE),
    n_t2 = sum(.data[[t2]] > .data[[t1]], na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    dom_group = if_else(n_t1 > n_t2, "t1_dominant", "t2_dominant"),
    sort_value = if_else(dom_group == "t1_dominant", -n_t1, n_t2)  # negative makes n_t1 descending
  ) %>%
  arrange(
    factor(dom_group, levels = c("t1_dominant", "t2_dominant")),
    sort_value
  )

plot_df <- plot_df %>%
  mutate(Standardized_Name_spA = factor(Standardized_Name_spA, 
                                       levels = sector_totals$Standardized_Name_spA))


gene_counts <- table(plot_df$Standardized_Name_spA)
xlims <- cbind(rep(0, length(gene_counts)), as.numeric(gene_counts))
maxval <- max(c(plot_df[[t1]], plot_df[[t2]]), na.rm = TRUE)
maxval <- signif(maxval,0)
ylim <- c(0, maxval)

png("../output_RNA/differential_expression/DE_05_brocmarker_circ_colored.png", width = 3000, height = 3000, res = 300)

circos.clear()
circos.par(start.degree = 90, gap.after = 5, cell.padding = c(0.01, 0, 0.01, 0))
circos.initialize(factors = names(gene_counts), xlim = xlims)

# Sector labels with improved styling
circos.track(
  ylim = c(0, 1), 
  track.height = 0.08, 
  bg.border = NA,
  panel.fun = function(x, y) {
    sector <- CELL_META$sector.index
    # Calculate text size based on sector name length
    text_cex <- min(0.7, 10/nchar(sector))
    circos.text(CELL_META$xcenter, 0.5, sector,
                facing = "bending.inside", 
                cex = text_cex,
                font = 2)  # Bold text
  }
)

circos.trackPlotRegion(
  ylim = c(-maxval, maxval),  # negative to positive
  track.height = 0.5,
  bg.border = "gray90",
  panel.fun = function(x, y) {
    
    sector <- CELL_META$sector.index
    
    genes <- plot_df %>% filter(Standardized_Name_spA == sector) %>%
                      mutate(
              dom_group = if_else(.data[[t1]] > .data[[t2]], "t1_dominant", "t2_dominant")
            ) %>%
            arrange(
              factor(dom_group, levels = c("t1_dominant", "t2_dominant")),
              desc(if_else(dom_group == "t1_dominant", .data[[t1]], -.data[[t2]]))
            )

    # Add horizontal grid lines at standardized levels
    grid_levels <- c(maxval * -0.25, maxval * -0.5, maxval * -0.75, maxval * 0.25, maxval * 0.5, maxval * 0.75)
    for (level in grid_levels) {
      circos.lines(c(0, CELL_META$xlim[2]), c(level, level),
                   col = "gray90", lwd = 0.8)
    }
    
    for (i in seq_len(nrow(genes))) {
      t1_val <- genes[[t1]][i]
      t2_val <- genes[[t2]][i]
      
      # Determine colors based on dominance
      if (t1_val > t2_val) {
        t1_col <- "#FD8D3C"  # Bright color for dominant
        t2_col <- "#E6D3F7"  # Muted color for non-dominant
      } else {
        t1_col <- "#FED8A6"  # Muted color for non-dominant
        t2_col <- "mediumpurple1"  # Bright color for dominant
      }
      
      # Add border to emphasize dominant bar
      t1_border <- "white" # ifelse(t1_val > t2_val, "black", "white")
      t2_border <- "white" #ifelse(t2_val > t1_val, "black", "white")
      
      # Tissue 1 goes up (positive)
      circos.rect(i-1, 0, i, t1_val, col = t1_col, border = t1_border, lwd = 0.5)
      # Tissue 2 goes down (negative) 
      circos.rect(i-1, 0, i, -t2_val, col = t2_col, border = t2_border, lwd = 0.5)
    }
    
        # Add y-axis labels on first sector
    if (CELL_META$sector.numeric.index == 1) {
      circos.yaxis(side = "left", at = c(-maxval, -maxval/2, 0, maxval/2, maxval),
                   labels = c(sprintf("%.0f", -maxval), sprintf("%.0f", -maxval/2),"0", sprintf("%.0f", maxval/2), sprintf("%.0f", maxval)),
                   tick.length = 0.15, labels.cex = 0.8, lwd = 1)
    }
  }
)

# Enhanced legend with better positioning and styling
par(xpd = TRUE)
legend("topright", 
       legend = paste(c("Outer:", "Inner:"), c(t1, t2)), 
       fill = c("#FD8D3C", "mediumpurple1"),
       border = "black",
       bty = "o",  # Draw box around legend
       cex = 1.0,
       title = "Tissue Type",
       title.cex = 1.1)

# Add subtitle with sample info
n_genes <- nrow(plot_df)
n_sectors <- length(unique(plot_df$Standardized_Name_spA))
mtext(sprintf("VST-normalized expression | %d genes across %d gene categories", 
              n_genes, n_sectors),
      side = 3, line = 0, cex = 0.9, col = "gray40")

dev.off()
## quartz_off_screen 
##                 2
# Prepare data using existing DESeq results
plot_df <- DE_05_marker_broc %>%
  select(query, Standardized_Name_spA, log2FoldChange.x) %>%
  # Remove any rows with missing data
  filter(!is.na(log2FoldChange.x), !is.na(Standardized_Name_spA)) %>%
  # Rename for clarity
  rename(gene = query, log2FC = log2FoldChange.x)

#plot_df <- plot_df %>% mutate(log2FC = -1*log2FC)
  
# Sort sectors by total absolute fold change for better visual flow
sector_totals <- plot_df %>%
  group_by(Standardized_Name_spA) %>%
  summarise(total_abs_fc = sum(abs(log2FC), na.rm = TRUE)) %>%
  arrange(desc(total_abs_fc))

plot_df <- plot_df %>%
  mutate(Standardized_Name_spA = factor(Standardized_Name_spA, 
                                       levels = sector_totals$Standardized_Name_spA))

# Set up circos parameters
gene_counts <- table(plot_df$Standardized_Name_spA)
xlims <- cbind(rep(0, length(gene_counts)), as.numeric(gene_counts))

# Create symmetric scale around 0
max_lfc <- max(abs(plot_df$log2FC), na.rm = TRUE)
max_lfc <- ceiling(max_lfc)  # Round up to nearest integer for clean axis
ylim <- c(-max_lfc, max_lfc)

# Define colors for positive and negative fold changes
pos_color <- "#FD8D3C"  # Orange for positive (higher in tissue1)
neg_color <- "mediumpurple1"  # Purple for negative (higher in tissue2)

# Create the plot
png("../output_RNA/differential_expression/DE_05_brocmarker_lfc_circ.png", 
    width = 3000, height = 3000, res = 300)

circos.clear()
circos.par(start.degree = 90, gap.after = 4, cell.padding = c(0.01, 0, 0.01, 0))
circos.initialize(factors = names(gene_counts), xlim = xlims)

# Sector labels
circos.track(
  ylim = c(0, 1), 
  track.height = 0.08, 
  bg.border = NA,
  panel.fun = function(x, y) {
    sector <- CELL_META$sector.index
    text_cex <- min(0.7, 10/nchar(sector))
    circos.text(CELL_META$xcenter, 0.5, sector,
                facing = "bending.inside", 
                cex = text_cex,
                font = 2)
  }
)

# Main track with diverging bars
circos.trackPlotRegion(
  ylim = ylim,
  track.height = 0.4,
  bg.border = "gray90",
  panel.fun = function(x, y) {
    sector <- CELL_META$sector.index
    genes <- plot_df %>% filter(Standardized_Name_spA == sector)
    
    # Add horizontal grid lines
    grid_levels <- seq(-max_lfc, max_lfc, by = max_lfc/2)
    for (level in grid_levels) {
      circos.lines(c(0, CELL_META$xlim[2]), c(level, level), 
                   col = "gray92", lwd = 0.8)
    }
    
    # Add center line at 0
    circos.lines(c(0, CELL_META$xlim[2]), c(0, 0), 
                 col = "gray60", lwd = 1.2)
    
    # Plot bars
    for (i in seq_len(nrow(genes))) {
      lfc_val <- genes$log2FC[i]
      bar_color <- ifelse(lfc_val > 0, pos_color, neg_color)
      
      circos.rect(i-1, 0, i, lfc_val,
                  col = bar_color, border = "white", lwd = 0.3)
    }
    
    # Add y-axis labels on first sector
    if (CELL_META$sector.numeric.index == 1) {
      axis_at <- seq(-max_lfc, max_lfc, by = max_lfc/2)
      circos.yaxis(side = "left", at = axis_at,
                   labels = sprintf("%.0f", axis_at),
                   tick.length = 0.15, labels.cex = 0.8, lwd = 1)
    }
  }
)

# Enhanced legend
par(xpd = TRUE)
legend(x = -1.3, y = 1.3, 
       legend = c(paste("Higher in", t1), "Equal", paste("Higher in", t2)), 
       fill = c(pos_color, "gray90", neg_color),
       border = "white",
       bty = "n",
       cex = 0.9,
       pt.cex = 1.2)

# Add informative subtitle
n_genes <- nrow(plot_df)
n_sectors <- length(unique(plot_df$Standardized_Name_spA))
mtext(sprintf("Log2 Fold Change | %d genes across %d categories", 
              n_genes, n_sectors),
      side = 3, line = -1, cex = 0.8, col = "gray40")

# Add interpretation guide
mtext("Positive = higher in tissue1 | Negative = higher in tissue2", 
      side = 1, line = -1, cex = 0.7, col = "gray50")

dev.off()
## quartz_off_screen 
##                 2

bacteria genes

mucin <- Manual %>% filter(grepl("mucin", ProteinNames, ignore.case = TRUE)|
                             grepl("lectin", ProteinNames, ignore.case = TRUE)|
                             grepl("toll-", ProteinNames, ignore.case = TRUE)|
                             grepl("ZP", ProteinNames, ignore.case = TRUE)|
                             grepl("nitric", ProteinNames, ignore.case = TRUE)|
                             grepl("sulfotransferase", ProteinNames, ignore.case = TRUE)|
                             grepl("Complement ", ProteinNames, ignore.case = TRUE)|
                             grepl("toll-like receptor", BiologicalProcess, ignore.case = TRUE)|
                             grepl("NF-kappaB", BiologicalProcess, ignore.case = TRUE)|
                             grepl("complement activation", BiologicalProcess, ignore.case = TRUE)|
                             grepl("3'-phosphoadenosine 5'-phosphosulfate", BiologicalProcess, ignore.case = TRUE)) %>% 
  filter(Heatmap_Label !="Cnidocyte marker protein (Collectin-11)")

select <- unique(mucin$query)
 
z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(select,(gene_labels$query)),2], fontsize_row = 8, cutree_rows = 2)
top50_DE
save_ggplot(top50_DE, "bacteria_DE_SwissProt",height=12)

# Perform clustering
row_clusters <- hclust(dist(z_scores))
cluster_assignments <- cutree(row_clusters, k = 2) # Adjust k for the number of clusters

# Create a dataframe for clustering and label management
clustered_data <- data.frame(
  query = select,
  Heatmap_Label = gene_labels[match(select, gene_labels$query), 2],
  cluster = cluster_assignments
)

# Reorder rows within each cluster alphabetically by their labels
clustered_data <- clustered_data %>%
  arrange(cluster, Heatmap_Label)

# Reorder the z_scores matrix and labels based on the new order
z_scores <- z_scores[match(clustered_data$query, rownames(z_scores)), ]
ordered_labels <- clustered_data$Heatmap_Label

# Generate the heatmap with reordered rows and labels
mucins <- pheatmap(
  z_scores,
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),
  cluster_rows = FALSE,  # Disable clustering since rows are pre-ordered
  cluster_cols = TRUE,
  cutree_cols = 2,
  annotation_col = (heatmap_metadata %>% select(Tissue)),
  annotation_colors = ann_colors,
  labels_row = ordered_labels,
  fontsize_row = 8,
  borders_color = "white",  
  gaps_row = c(43) 
)

# Save the heatmap
save_ggplot(mucins, "clusters_clean/bacteria_DE_SwissProt",height=12)

Solute Carrier (SLC) Family genes

solute_all <- DESeq_SwissProt %>% filter(grepl("Solute carrier", ProteinNames, ignore.case = TRUE))
solute_all <- solute_all %>%
  mutate(SLC_family = str_extract(ProteinNames, "Solute carrier family \\d+")) %>%
  mutate(SLC_family = str_replace(SLC_family,"Solute carrier family ", "SLC")) 

nrow(solute_all)
## [1] 211
solute_de <- Manual %>% filter(grepl("Solute carrier", ProteinNames, ignore.case = TRUE))
solute_de <- solute_de %>%
  mutate(SLC_family = str_extract(ProteinNames, "Solute carrier family \\d+")) %>%
  mutate(SLC_family = str_replace(SLC_family,"Solute carrier family ", "SLC")) 


solute_de_upAboral <- solute_de %>% filter(log2FoldChange < -1)
nrow(solute_de_upAboral)
## [1] 9
solute_de_upOral <- solute_de %>% filter(log2FoldChange > 1)
nrow(solute_de_upOral)
## [1] 22
length(unique(solute_all$SLC_family))
## [1] 45
length(unique(solute_de_upAboral$SLC_family))
## [1] 8
length(unique(solute_de_upOral$SLC_family))
## [1] 11
table(solute_de$SLC_family)
## 
##  SLC1 SLC12 SLC15 SLC16 SLC22 SLC23 SLC24 SLC25 SLC30 SLC32 SLC35 SLC46 SLC53 
##     2     3     1     4     2     2     1     2     1     1     1     1     1 
##  SLC6  SLC7 
##     5     4
#Basing my annotations on: https://re-solute.eu/knowledgebase/slcfamily and https://slc.bioparadigms.org/

slc_map <- data.frame(
  SLC_family = c(
    "SLC1",
    "SLC12",
    "SLC15",
    "SLC16",
    "SLC22",
    "SLC23",
    "SLC24",
    "SLC25",
    "SLC30",
    "SLC32",
    "SLC35",
    "SLC46",
    "SLC53",
    "SLC6",
    "SLC7"
  ),
  Function = c(
    "High-affinity glutamate and neutral amino acid transporter family",
    "Electroneutral cation-coupled Cl cotransporter family",
    "Proton oligopeptide cotransporter family",
    "Monocarboxylate transporter family",
    "Organic cation/anion/zwitterion transporter family",
    "Sodium-dependent ascorbic acid transporter family",
    "Na+/(Ca2+-K+) exchanger family",
    "Mitochondrial carrier family",
    "Zinc transporter family",
    "Vesicular inhibitory amino acid transporter family",
    "Nucleoside-sugar transporter family",
    "Folate transporter family",
    "Phosphate carriers",
    "Sodium- and chloride-dependent neurotransmitter transporter family",
    "Cationic amino acid transporter/glycoprotein-associated family"
  ),
  Role = c(
    "Neurotransmission, Glu metabolism, development, aa homeostasis",
    "cell volume, chloride concentration, blood pressure regulation",
    "Immunity, homeostasis of neuropeptides, peptide absoption",
    "glucose homeostasis / metabolism, lipid metabolism",
    "Drug uptake, endocytosis",
    "Absortion and distribution of ascorbic acid",
        "Skin/hair/eye pigmentation, vision, olfaction",
    "Apoptosis, Glucose/Lipid/Amino acid/Urea, metab., ATP synthesis",
    "Zinc homestasis, development, body weight, glucose metabolism",
    "Development, neurotransmission",
    "Virus infection, development, autophagy, cell proliferation",
    "Folate metabolism, Immunity, Drug uptake",
    "Virus infection, Phosphate homeostasis, development",
    "Cell differentiation, neurotransmission",
    "Nitric oxide synthesis, redox balance, aa homeostasis"
  ),
  Substrate_Specific = c(
    "Glutamate as Neurotransmitter, anionic, neutral amino acids",
    "Chloride, sodium, potassium",
    "Dipeptides, oligopeptides",
    "Ketone bodies, Lactate, Pyruvate, creatine, aromatic amino acids, T2,T3,T4",
    "Drugs, neurotransmitters, purine and pyrimidine nucleosides, niacin, steroid, carnitine, carnitine derivatives, organic anions",
    "Ascorbic acid",
    "Calcium, Potassium, Sodium",
    "Amino acids, Di- tricarboxylic ions, Co-A, Carnitine/acyl-carnitines, H+, metals, nucleotides",
    "Zinc, Managnese, Lead",
    "GABA, Glycine",
    "Purines, pyrimidines, Thiamine, nuclotide-sugars",
    "Folic acid, Heme, Methotrexate",
    "Phosphate",
    "Neurotransmitters, basic, neutral amino acids",
    "Amino acids"
  ),
  Substrate_Broad = c(
    "Amino Acids",
    "Inorganic Ions",
    "Peptides",
    "Energy\nmetabolites",
    "Organic Ions",
    "Ascorbic\nAcid",
    "Inorganic Ions",
    "Mitochondrial",
    "Transition\nElement\nCations",
    "Amino acids;\nNeuro-\ntransmitter",
    "Nucleotides",
    "Folate",
    "Organic Ions",
    "Neuro-\ntransmitter",
    "Amino Acids"
  )
)
solute_de <- solute_de %>%
  left_join(slc_map, by = "SLC_family") 

write.csv(solute_de, file=paste0(outdir,"/DE_05_SLC_genes.csv"))

plot_data <- solute_de %>%
  mutate(Direction = ifelse(log2FoldChange < -1 , "upAboral",
                            ifelse(log2FoldChange > 1, "upOral", "noChange"))) 
# Summarize counts by family and direction
plot_data <- plot_data %>%
  filter(Direction != "noChange") %>%
  group_by(SLC_family, Substrate_Broad, Direction) %>%
  summarise(n = n(), .groups = "drop")

ggplot(plot_data, aes(x = SLC_family, y = n, fill = Direction)) +
  geom_bar(stat = "identity") +
  facet_wrap(~Substrate_Broad, scales = "free_x") +
  scale_fill_manual(values = c("upOral" = "#FD8D3C", "upAboral" = "mediumpurple1")) +
  theme_minimal() +
  labs(title = "Differentially Expressed SLC Genes by Family",
       x = "SLC Family", y = "Number of DE Genes") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

select <- solute_de$query

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(select,(gene_labels$query)),2], fontsize_row = 8, cutree_rows = 2)
top50_DE

# Perform clustering
row_clusters <- hclust(dist(z_scores))
cluster_assignments <- cutree(row_clusters, k = 2) # Adjust k for the number of clusters

# Create a dataframe for clustering and label management
clustered_data <- data.frame(
  query = select,
  Heatmap_Label = gene_labels[match(select, gene_labels$query), 2],
  cluster = cluster_assignments
)

# Reorder rows within each cluster alphabetically by their labels
clustered_data <- clustered_data %>%
  arrange(cluster, Heatmap_Label)

# Reorder the z_scores matrix and labels based on the new order
z_scores <- z_scores[match(clustered_data$query, rownames(z_scores)), ]
ordered_labels <- clustered_data$Heatmap_Label

# Generate the heatmap with reordered rows and labels
SLC <- pheatmap(
  z_scores,
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),
  cluster_rows = FALSE,  # Disable clustering since rows are pre-ordered
  cluster_cols = TRUE,
  cutree_cols = 2,
  annotation_col = (heatmap_metadata %>% select(Tissue)),
  annotation_colors = ann_colors,
  labels_row = ordered_labels,
  fontsize_row = 8,
  borders_color = "white", 
  gaps_row = c(21) 
)

temp and light sensing

sensors <- Manual %>% filter(query %in% (category_annotation %>% filter(grepl("Sensor",category)|grepl("Stereo",category)) %>% rownames()))

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(select,(gene_labels$query)),2], fontsize_row = 8, cutree_rows = 2)
top50_DE

# Perform clustering
row_clusters <- hclust(dist(z_scores))
cluster_assignments <- cutree(row_clusters, k = 2) # Adjust k for the number of clusters

# Create a dataframe for clustering and label management
clustered_data <- data.frame(
  query = select,
  Heatmap_Label = gene_labels[match(select, gene_labels$query), 2],
  cluster = cluster_assignments
)

# Reorder rows within each cluster alphabetically by their labels
clustered_data <- clustered_data %>%
  mutate(cluster = factor(cluster, levels = c(1, 3, 2))) %>%
  arrange(cluster, Heatmap_Label)

# Reorder the z_scores matrix and labels based on the new order
z_scores <- z_scores[match(clustered_data$query, rownames(z_scores)), ]
ordered_labels <- clustered_data$Heatmap_Label

# Generate the heatmap with reordered rows and labels
transporters_heat <- pheatmap(
  z_scores,
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),
  cluster_rows = FALSE,
  cluster_cols = TRUE,
  cutree_cols = 2,
  annotation_col = (heatmap_metadata %>% select(Tissue)),
  annotation_colors = ann_colors,
  labels_row = ordered_labels,
  fontsize_row = 8,
  borders_color = "white",
  gaps_row = c(20)
)

Zona pellucida

Amil_ZP <- DESeq_SwissProt %>% filter(blast_hit=="G8HTB6") %>% pull(query) %>% unique()

PFAM_ZP_domain <- annot_all   %>% filter(grepl("Zona_pellucida",Short.name,ignore.case=TRUE))  %>% 
  #filter(!(query %in% Amil_ZP)) %>% 
  pull(query) %>% unique()

Heat Stress Genes

differentially expressed

select <- DE_05_HeatStressGenes %>% arrange(response_type) %>% select(query,gene_name,response_type) %>% unique()
rownames(select) <- select$query

z_scores <- t(scale(t(assay(vsd)[select$query, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_row = (select %>% select(response_type)), annotation_colors = ann_colors,
         labels_row = select$gene_name, fontsize_row = 8, cutree_rows = 2)
top50_DE

# Perform clustering
row_clusters <- hclust(dist(z_scores))
cluster_assignments <- cutree(row_clusters, k = 2) # Adjust k for the number of clusters

# Create a dataframe for clustering and label management
clustered_data <- data.frame(
  query = select$query,
  Heatmap_Label = select$gene_name,
  cluster = cluster_assignments
)

# Reorder rows within each cluster alphabetically by their labels
clustered_data <- clustered_data %>%
  arrange(cluster, Heatmap_Label)

# Reorder the z_scores matrix and labels based on the new order
z_scores <- z_scores[match(clustered_data$query, rownames(z_scores)), ]
ordered_labels <- clustered_data$Heatmap_Label

# Ensure there is only 1 gap between clusters
#row_breaks <- which(cluster_assignments != cluster_assignments[1])  # Only where cluster switches

# Generate the heatmap with reordered rows and labels
top50_DE <- pheatmap(
  z_scores,
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),
  cluster_rows = FALSE,  # Disable clustering since rows are pre-ordered
  cluster_cols = TRUE,
  cutree_cols = 2,
  annotation_col = (heatmap_metadata %>% select(Tissue)),
  annotation_colors = ann_colors,
  labels_row = ordered_labels,
  annotation_row = (select %>% select(response_type)),
  fontsize_row = 8,
  borders_color = "white",  # Optional border around clusters
  
  gaps_row = 1  # Add breaks only between clusters
)

all

select <- DESeq_HeatStressGenes %>% arrange(response_type) %>% select(query,gene_name,response_type) %>%
    group_by(query) %>% #collapse duplicate rows
  summarise(
    gene_name = paste(unique(gene_name), collapse = "; "),
    response_type = paste(unique(response_type), collapse = "; "),
    .groups = "drop"
  ) %>% arrange(response_type,gene_name)
select <- data.frame(select)
rownames(select) <- select$query

z_scores <- t(scale(t(assay(vsd)[select$query, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2, annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_row = (select %>% select(response_type)), annotation_colors = ann_colors,
         labels_row = select$gene_name, fontsize_row = 8, cutree_rows = 2)
top50_DE

# Perform clustering
row_clusters <- hclust(dist(z_scores))
cluster_assignments <- cutree(row_clusters, k = 2) # Adjust k for the number of clusters

# Create a dataframe for clustering and label management
clustered_data <- data.frame(
  query = select$query,
  Heatmap_Label = select$gene_name,
  cluster = cluster_assignments
)

# Reorder rows within each cluster alphabetically by their labels
clustered_data <- clustered_data %>%
  arrange(cluster, Heatmap_Label)

# Reorder the z_scores matrix and labels based on the new order
z_scores <- z_scores[match(clustered_data$query, rownames(z_scores)), ]
ordered_labels <- clustered_data$Heatmap_Label

# Ensure there is only 1 gap between clusters
#row_breaks <- which(cluster_assignments != cluster_assignments[1])  # Only where cluster switches

# Generate the heatmap with reordered rows and labels
top50_DE <- pheatmap(
  z_scores,
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),
  cluster_rows = FALSE,  # Disable clustering since rows are pre-ordered
  cluster_cols = TRUE,
  cutree_cols = 2,
  annotation_col = (heatmap_metadata %>% select(Tissue)),
  annotation_colors = ann_colors,
  labels_row = ordered_labels,
  annotation_row = (select %>% select(response_type)),
  fontsize_row = 8,
  borders_color = "white",  # Optional border around clusters
  
  gaps_row = 4  # Add breaks only between clusters
)

wnt and hox

select <- unique(COMBINED_HOX_WNT_DE05$query)

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = COMBINED_HOX_WNT_DE05$Heatmap_Label, fontsize_row = 8, cutree_rows = 2)
top50_DE
save_ggplot(top50_DE, "Hox_All_DE")

# Perform clustering
row_clusters <- hclust(dist(z_scores))
cluster_assignments <- cutree(row_clusters, k = 2) # Adjust k for the number of clusters

# Create a dataframe for clustering and label management
clustered_data <- data.frame(
  query = select,
  Heatmap_Label = COMBINED_HOX_WNT_DE05$Heatmap_Label,
  cluster = cluster_assignments,
  pathway = COMBINED_HOX_WNT_DE05$pathway
)

# Reorder rows within each cluster alphabetically by their labels
clustered_data <- clustered_data %>%
  arrange(cluster, pathway, Heatmap_Label)

# Reorder the z_scores matrix and labels based on the new order
z_scores <- z_scores[match(clustered_data$query, rownames(z_scores)), ]
ordered_labels <- clustered_data$Heatmap_Label

# Ensure there is only 1 gap between clusters
row_breaks <- which(cluster_assignments != cluster_assignments[1])  # Only where cluster switches

# Generate the heatmap with reordered rows and labels
top50_DE <- pheatmap(
  z_scores,
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),
  cluster_rows = FALSE,  # Disable clustering since rows are pre-ordered
  cluster_cols = TRUE,
  cutree_cols = 2,
  annotation_col = (heatmap_metadata %>% select(Tissue)),
  annotation_colors = ann_colors,
  labels_row = ordered_labels,
  fontsize_row = 8,
  borders_color = "white",  
  gaps_row = c(9,14,19,25) 
)

# Save the heatmap
save_ggplot(top50_DE, "clusters_clean/Hox_All_DE")

select <- unique(COMBINED_HOX_WNT_all$query)

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = COMBINED_HOX_WNT_all$Heatmap_Label, fontsize_row = 8, cutree_rows = 2)
top50_DE
save_ggplot(top50_DE, "Hox_All")

# Perform clustering
row_clusters <- hclust(dist(z_scores))
cluster_assignments <- cutree(row_clusters, k = 2) # Adjust k for the number of clusters

# Create a dataframe for clustering and label management
clustered_data <- data.frame(
  query = select,
  Heatmap_Label = COMBINED_HOX_WNT_all$Heatmap_Label,
  cluster = cluster_assignments,
  pathway = COMBINED_HOX_WNT_all$pathway
)

# Reorder rows within each cluster alphabetically by their labels
clustered_data <- clustered_data %>%
  arrange(cluster, pathway, Heatmap_Label)

# Reorder the z_scores matrix and labels based on the new order
z_scores <- z_scores[match(clustered_data$query, rownames(z_scores)), ]
ordered_labels <- clustered_data$Heatmap_Label

# Ensure there is only 1 gap between clusters
#row_breaks <- which(cluster_assignments != cluster_assignments[1])  # Only where cluster switches

# Generate the heatmap with reordered rows and labels
top50_DE <- pheatmap(
  z_scores,
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),
  cluster_rows = FALSE,  # Disable clustering since rows are pre-ordered
  cluster_cols = TRUE,
  cutree_cols = 2,
  annotation_col = (heatmap_metadata %>% select(Tissue)),
  annotation_colors = ann_colors,
  labels_row = ordered_labels,
  fontsize_row = 8,
  borders_color = "white",  
  gaps_row = c(14,19,24,33) 
)

# Save the heatmap
save_ggplot(top50_DE, "clusters_clean/Hox_All")

Aboral Oral Larvae Paper - Pdam genome

Join our data to Pdam annotations

Annot_Pdam <- read.csv("../references/annotation/blastp_Pdam_out.tab", sep = '\t', header = FALSE) %>% select(c(1,2)) %>% dplyr::rename("protein_id" = "V2", "query" = "V1")
Manual_Pdam <- left_join(Manual, Annot_Pdam) 
library(readxl)
larval_aboral_enriched <- read_excel("../output_RNA/marker_genes/RamonMateu_Pdam_larval.xlsx", sheet = "pocillopora_aboral_enriched_05_")

Manual_Pdam_upaboral <- Manual_Pdam %>% filter(log2FoldChange < 0)
inner_join(larval_aboral_enriched, Manual_Pdam_upaboral, by=c("gene ID"="protein_id")) %>% dim()
## [1] 16 29
#18 genes overlap from the 126 in the paper

larval_oral_enriched <- read_excel("../output_RNA/marker_genes/RamonMateu_Pdam_larval.xlsx", sheet = "pocillopora_oral_enriched_05_lf")

Manual_Pdam_uporal <- Manual_Pdam %>% filter(log2FoldChange > 0)
inner_join(larval_oral_enriched, Manual_Pdam_uporal, by=c("gene ID"="protein_id")) %>% dim()
## [1]  6 29
inner_join(larval_oral_enriched, Manual_Pdam_uporal, by=c("gene ID"="protein_id")) %>% print()
## # A tibble: 6 Ă— 29
##   `gene ID`      baseMean.x log2FoldChange.x lfcSE.x  stat pvalue.x   padj.x
##   <chr>               <dbl>            <dbl>   <dbl> <dbl>    <dbl>    <dbl>
## 1 XP_027046532.1       88.9             4.29   0.652  5.04 4.54e- 7 8.03e- 5
## 2 XP_027036583.1      563.              3.04   0.287  7.12 1.06e-12 4.11e-10
## 3 XP_027054215.1      918.              2.57   0.275  5.71 1.14e- 8 2.77e- 6
## 4 XP_027054196.1      855.              2.49   0.263  5.65 1.59e- 8 3.76e- 6
## 5 XP_027058731.1     1372.              2.44   0.270  5.35 8.95e- 8 1.85e- 5
## 6 XP_027040620.1     1782.              1.83   0.241  3.45 5.66e- 4 4.98e- 2
## # ℹ 22 more variables: `pfam domain` <chr>, `signal peptide` <chr>,
## #   `astroides ortholog` <chr>, `astroides oral` <chr>,
## #   `clytia ortholog` <chr>, `clytia oral` <chr>, query <chr>,
## #   baseMean.y <dbl>, log2FoldChange.y <dbl>, lfcSE.y <dbl>, pvalue.y <dbl>,
## #   padj.y <dbl>, blast_hit <chr>, perc_ident <dbl>, evalue <dbl>,
## #   Heatmap_Label <chr>, ProteinNames <chr>, BiologicalProcess <chr>,
## #   GeneOntologyIDs <chr>, CellularComponent <chr>, MolecularFunction <chr>, …
#only 6/83 genes overlap


larval_aboral_clusters <- read_excel("../output_RNA/marker_genes/RamonMateu_Pdam_larval_scRNA.xlsx") %>% filter(!is.na(pocillopora))

larval_aboral_clusters_Pacuta <- inner_join(larval_aboral_clusters, Manual_Pdam, by=c("pocillopora"="protein_id"))

#my genes upregulated in oral tissue are high in mucous cells, cool

Nice Volcanoes

ann_colors = list(Tissue = c(OralEpi = "#FD8D3C" ,Aboral = "mediumpurple1"))

colorRampPalette(c("mediumpurple1", "white", "#FD8D3C"))(5)
## [1] "#AB82FF" "#D4C0FF" "#FFFFFF" "#FEC69D" "#FD8D3C"
LFC_colors <- ifelse(DESeq_SwissProt$padj < 0.05 & DESeq_SwissProt$log2FoldChange > 1, '#FD8D3C',
                  ifelse(DESeq_SwissProt$padj < 0.05 & DESeq_SwissProt$log2FoldChange > 0 , '#FEC69D',
                         ifelse(DESeq_SwissProt$padj < 0.05 & DESeq_SwissProt$log2FoldChange < -1, '#AB82FF',
                                ifelse(DESeq_SwissProt$padj < 0.05 & DESeq_SwissProt$log2FoldChange < 0, '#D4C0FF',
                                       'grey'))))


LFC_colors[is.na(LFC_colors)] <- 'grey'
  names(LFC_colors)[LFC_colors == '#FD8D3C'] <- 'Up_Oral_LFC_sig'
  names(LFC_colors)[LFC_colors == '#FEC69D'] <- 'Up_Oral'
  names(LFC_colors)[LFC_colors == 'grey'] <- 'n.s.'
  names(LFC_colors)[LFC_colors == '#AB82FF'] <- 'Up_Aboral_LFC_sig'
  names(LFC_colors)[LFC_colors == '#D4C0FF'] <- 'Up_Aboral'
  
  
DESeq_SwissProt$short_name <- ifelse(nchar(DESeq_SwissProt$ProteinNames) > 25, 
                            paste0(substr(DESeq_SwissProt$ProteinNames, 1, 22), "..."), 
                            DESeq_SwissProt$ProteinNames)
 
plot <- EnhancedVolcano(DESeq_SwissProt, 
    lab = ifelse(DESeq_SwissProt$padj < 0.05 & abs(DESeq_SwissProt$log2FoldChange) > 10, DESeq_SwissProt$short_name, ""),
    xlim = c(min(DESeq_SwissProt[["log2FoldChange"]], na.rm = TRUE) - 5, max(DESeq_SwissProt[["log2FoldChange"]], na.rm = TRUE) + 5),
  #ylim = c(0, max(-log10(toptable[[y]]), na.rm = TRUE) + 5),
    labSize = 3,
    axisLabSize = 8,
    captionLabSize = 4,
    x = "log2FoldChange", 
    y = "padj",
    pCutoff = 0.05,
    colCustom = LFC_colors, colAlpha = .7,
    shape=16,
    title = NULL, subtitle = NULL, 
    legendPosition = "bottom",
    legendLabSize = 8,
    pointSize = 1,
    legendIconSize = 3,
    drawConnectors = TRUE,
    widthConnectors = 0.5,
    directionConnectors = "x",
    border = 'full',
    max.overlaps = Inf,
    maxoverlapsConnectors = 0,
    arrowheads =FALSE,
    min.segment.length = 0,
    gridlines.major = FALSE,
  gridlines.minor = FALSE
) #+ coord_flip() + scale_x_reverse()

plot

plot <- EnhancedVolcano(DESeq_SwissProt, 
    lab = ifelse(DESeq_SwissProt$padj < 0.05 & abs(DESeq_SwissProt$log2FoldChange) > 10, DESeq_SwissProt$short_name, ""),
    xlim = c(min(DESeq_SwissProt[["log2FoldChange"]], na.rm = TRUE) - .5, max(DESeq_SwissProt[["log2FoldChange"]], na.rm = TRUE) + .5),
  ylim = c(0, max(-log10(DESeq_SwissProt[["padj"]]), na.rm = TRUE) + .5),
    labSize = 3,
    axisLabSize = 12,
    captionLabSize = 4,
    x = "log2FoldChange", 
    shape=16,
    y = "padj",
    pCutoff = 0.05,
    colCustom = LFC_colors, colAlpha = .7,
    title = NULL, subtitle = NULL, 
    legendPosition = "bottom",
    legendLabSize = 8,
    pointSize = 1,
    legendIconSize = 3,
    drawConnectors = TRUE,
    widthConnectors = 0.5,
    directionConnectors = "x",
    border = 'full',
    max.overlaps = Inf,
    maxoverlapsConnectors = 0,
    arrowheads =FALSE,
    min.segment.length = 0,
    gridlines.major = FALSE,
  gridlines.minor = FALSE
) #+ coord_flip() + scale_x_reverse()

plot

save_ggplot(plot, "Volcano_simple", width = 10, height = 5.25)

#DESeq_SwissProt_categorized_collapsed

Updating Renv environment:

After you’ve confirmed your code works as expected, use renv::snapshot() to record the packages and their sources in the lockfile.

renv::snapshot()